WYBar committed
Commit 8fe62ee · 1 Parent(s): 7dfddd5

finish with token

README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: ART V1.0
- emoji:
+ emoji: 📊
  colorFrom: gray
- colorTo: red
+ colorTo: indigo
  sdk: gradio
  sdk_version: 5.20.0
  app_file: app.py
app.py ADDED
@@ -0,0 +1,684 @@
1
+ import os
2
+ import spaces
3
+
4
+ import ast
5
+ import numpy as np
6
+ from functools import partial
7
+
8
+ import torch
9
+ import torch.utils.checkpoint
10
+
11
+ from PIL import Image
12
+ import xml.etree.cElementTree as ET
13
+ from io import BytesIO
14
+ import base64
15
+ import json
16
+
17
+ import gradio as gr
18
+ from functools import partial
19
+ import requests
20
+ import base64
21
+ import os
22
+ import time
23
+ import re
24
+
25
+ from transformers import (
26
+ AutoTokenizer,
27
+ set_seed
28
+ )
29
+ from typing import List
30
+
31
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
32
+ from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList, \
33
+ STOPPING_CRITERIA_INPUTS_DOCSTRING, add_start_docstrings
34
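+ # Stopping criterion: ends generation once the most recently generated token id is in
+ # token_id_list (used below with id 128000 to terminate layout decoding).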
+ class StopAtSpecificTokenCriteria(StoppingCriteria):
35
+ def __init__(self, token_id_list: List[int] = None):
36
+ self.token_id_list = token_id_list
37
+ @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
38
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
39
+ return input_ids[0][-1].detach().cpu().numpy() in self.token_id_list
40
+
41
+ def ensure_space_after_period(input_string):
42
+ # Normalize spacing so that each period is followed by exactly one space
43
+ output_string = re.sub(r'\.\s*', '. ', input_string)
44
+ return output_string
45
+
46
+ def generate_unique_filename():
47
+ # Generate a unique filename based on the current timestamp
48
+ timestamp = int(time.time() * 1000) # timestamp in milliseconds
49
+ # random_num = random.randint(1000, 9999) # random number
50
+ unique_filename = f"{timestamp}"
51
+ return unique_filename
52
+
53
+ def upload_to_github(file_path,
54
+ repo='WYBar/gradiodemo_svg',
55
+ branch='main',
56
+ token='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'):
57
+ if not os.path.isfile(file_path):
58
+ print(f"File not found: {file_path}")
59
+ return None, None
60
+ with open(file_path, 'rb') as file:
61
+ content = file.read()
62
+ encoded_content = base64.b64encode(content).decode('utf-8')
63
+ unique_filename = generate_unique_filename()
64
+ url = f"https://api.github.com/repos/{repo}/contents/{unique_filename}.svg"
65
+ headers = {
66
+ "Authorization": f"token {token}"
67
+ }
68
+ response = requests.get(url, headers=headers)
69
+
70
+ sha = None
71
+ if response.status_code == 200:
72
+ sha = response.json()['sha']
73
+ elif response.status_code == 404:
74
+ # File does not exist, so no SHA is needed
75
+ pass
76
+ else:
77
+ print(f"Failed to get file status: {response.status_code}")
78
+ # print(response.text)
79
+ return None, None
80
+
81
+ headers = {
82
+ "Authorization": f"token {token}",
83
+ "Content-Type": "application/json"
84
+ }
85
+ data = {
86
+ "message": "upload svg file",
87
+ "content": encoded_content,
88
+ "branch": branch
89
+ }
90
+
91
+ if sha:
92
+ # File exists: update it
93
+ # print('sha exists, update the old one')
94
+ data["sha"] = sha
95
+ response = requests.put(url, headers=headers, json=data)
96
+ else:
97
+ # File does not exist: create a new one
98
+ print("sha does not exist, need to create a new one")
99
+ response = requests.put(url, headers=headers, json=data)
100
+
101
+ # print(response.status_code)
102
+ # print(response.text)
103
+ if response.status_code in [200, 201]:
104
+ # print(response.json()['content']['download_url'])
105
+ return response.json()['content']['download_url'], unique_filename
106
+ else:
107
+ print(f"Upload failed with status {response.status_code}")
108
+ return None, None
109
+
110
+ def calculate_iou(box1, box2):
111
+ # Compute the intersection of the two boxes
112
+ x1 = max(box1[0], box2[0])
113
+ y1 = max(box1[1], box2[1])
114
+ x2 = min(box1[2], box2[2])
115
+ y2 = min(box1[3], box2[3])
116
+
117
+ intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
118
+
119
+ # Compute the union of the two boxes
120
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
121
+ box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
122
+
123
+ union_area = box1_area + box2_area - intersection_area
124
+
125
+ # Compute the IoU
126
+ iou = intersection_area / union_area
127
+ return iou
128
+
129
+ def adjust_coordinates(box):
130
+ size = 32
131
+ (x1, y1, x2, y2) = box
132
+ if x1 % size != 0:
133
+ x1 = (x1 // size) * size
134
+ if x2 % size != 0:
135
+ x2 = (x2 // size + 1) * size
136
+
137
+ if y1 % size != 0:
138
+ y1 = (y1 // size) * size
139
+ if y2 % size != 0:
140
+ y2 = (y2 // size + 1) * size
141
+ return (x1, y1, x2, y2)
142
+
143
+ def adjust_validation_box(validation_box):
144
+ return [adjust_coordinates(box) for box in validation_box]
145
+
146
+ def get_list_layer_box(list_png_images):
147
+ list_layer_box = []
148
+ for img in list_png_images:
149
+ img_np = np.array(img)
150
+ alpha_channel = img_np[:, :, -1]
151
+
152
+ # Step 1: Find the non-zero indices
153
+ rows, cols = np.nonzero(alpha_channel)
154
+
155
+ if (len(rows) == 0) or (len(cols) == 0):
156
+ # If there are no non-zero indices, we can skip this layer
157
+ list_layer_box.append((0, 0, 0, 0))
158
+ continue
159
+
160
+ # Step 2: Get the minimum and maximum indices for rows and columns
161
+ min_row, max_row = rows.min().item(), rows.max().item()
162
+ min_col, max_col = cols.min().item(), cols.max().item()
163
+
164
+ # Step 3: Quantize the minimum values down to the nearest multiple of 8
165
+ quantized_min_row = (min_row // 8) * 8
166
+ quantized_min_col = (min_col // 8) * 8
167
+
168
+ # Step 4: Quantize the maximum values up to the nearest multiple of 8 outside of the max
169
+ quantized_max_row = ((max_row // 8) + 1) * 8
170
+ quantized_max_col = ((max_col // 8) + 1) * 8
171
+ list_layer_box.append(
172
+ (quantized_min_col, quantized_min_row, quantized_max_col, quantized_max_row)
173
+ )
174
+ return list_layer_box
175
+
176
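+ # Crop each layer to its bounding box and embed it as a base64-encoded PNG <image>
+ # element of a single SVG whose canvas matches the first layer.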
+ def pngs_to_svg(list_png_images):
177
+ list_layer_box = get_list_layer_box(list_png_images)
178
+ assert(len(list_png_images) == len(list_layer_box))
179
+ width, height = list_png_images[0].width, list_png_images[0].height
180
+ img_svg = ET.Element(
181
+ 'svg',
182
+ {
183
+ "width": str(width),
184
+ "height": str(height),
185
+ "xmlns": "http://www.w3.org/2000/svg",
186
+ "xmlns:svg": "http://www.w3.org/2000/svg",
187
+ "xmlns:xlink":"http://www.w3.org/1999/xlink"
188
+ }
189
+ )
190
+ for img, box in zip(list_png_images, list_layer_box):
191
+ x, y, w, h = box[0], box[1], box[2]-box[0], box[3]-box[1]
192
+ if (w == 0 or h == 0):
193
+ continue
194
+ img = img.crop((x, y, x+w, y+h))
195
+ buffer = BytesIO()
196
+ img.save(buffer, format='PNG')
197
+ img_str = base64.b64encode(buffer.getvalue())
198
+ ET.SubElement(
199
+ img_svg,
200
+ "image",
201
+ {
202
+ "x": str(x),
203
+ "y": str(y),
204
+ "width": str(w),
205
+ "height": str(h),
206
+ "xlink:href": "data:image/png;base64,"+img_str.decode('utf-8')
207
+ }
208
+ )
209
+ return ET.tostring(img_svg, encoding='utf-8').decode('utf-8')
210
+
211
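+ # NOTE: duplicate of the calculate_iou defined earlier in this file.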
+ def calculate_iou(box1, box2):
212
+ # Compute the intersection of the two boxes
213
+ x1 = max(box1[0], box2[0])
214
+ y1 = max(box1[1], box2[1])
215
+ x2 = min(box1[2], box2[2])
216
+ y2 = min(box1[3], box2[3])
217
+
218
+ intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
219
+
220
+ # Compute the union of the two boxes
221
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
222
+ box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
223
+
224
+ union_area = box1_area + box2_area - intersection_area
225
+
226
+ # Compute the IoU
227
+ iou = intersection_area / union_area
228
+ return iou
229
+
230
+ # @spaces.GPU(enable_queue=True, duration=60)
231
+ def buildmodel(**kwargs):
232
+ from modeling_crello import CrelloModel, CrelloModelConfig
233
+ from quantizer import get_quantizer
234
+ # seed / input model / resume
235
+ resume = kwargs.get('resume', None)
236
+ seed = kwargs.get('seed', None)
237
+ input_model = kwargs.get('input_model', None)
238
+ quantizer_version = kwargs.get('quantizer_version', 'v4')
239
+ device = "cuda"
240
+
241
+ set_seed(seed)
242
+ # old_tokenizer = AutoTokenizer.from_pretrained(input_model, trust_remote_code=True)
243
+ old_tokenizer = AutoTokenizer.from_pretrained(
244
+ "WYBar/LLM_For_Layout_Planning", # repository path
245
+ subfolder="Meta-Llama-3-8B", # subfolder holding the base model
246
+ trust_remote_code=True,
247
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
248
+ )
249
+ old_vocab_size = len(old_tokenizer)
250
+ # tokenizer = AutoTokenizer.from_pretrained(resume, trust_remote_code=True)
251
+ tokenizer = AutoTokenizer.from_pretrained(
252
+ "WYBar/LLM_For_Layout_Planning",
253
+ subfolder="checkpoint-26000", # subfolder containing the checkpoint
254
+ trust_remote_code=True,
255
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
256
+ )
257
+
258
+ quantizer = get_quantizer(
259
+ quantizer_version,
260
+ update_vocab = False,
261
+ decimal_quantize_types = kwargs.get('decimal_quantize_types'),
262
+ mask_values = kwargs['mask_values'],
263
+ width = kwargs['width'],
264
+ height = kwargs['height'],
265
+ simplify_json = False,
266
+ num_mask_tokens = 0,
267
+ mask_type = kwargs.get('mask_type'),
268
+ )
269
+ quantizer.setup_tokenizer(tokenizer)
270
+
271
+ model_args = CrelloModelConfig(
272
+ old_vocab_size = old_vocab_size,
273
+ vocab_size=len(tokenizer),
274
+ pad_token_id=tokenizer.pad_token_id,
275
+ ignore_ids=tokenizer.convert_tokens_to_ids(quantizer.ignore_tokens),
276
+ )
277
+ model_args.freeze_lm = True
278
+ model_args.opt_version = "WYBar/LLM_For_Layout_Planning"
279
+ model_args.use_lora = False
280
+ model_args.load_in_4bit = kwargs.get('load_in_4bit', False)
281
+ # model = CrelloModel.from_pretrained(
282
+ # resume,
283
+ # config=model_args
284
+ # ).to(device)
285
+ # model = CrelloModel.from_pretrained(
286
+ # "WYBar/LLM_For_Layout_Planning",
287
+ # subfolder="checkpoint-26000", # checkpoint directory to load
288
+ # config=model_args,
289
+ # # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
290
+ # )
291
+ model = CrelloModel(config=model_args)
292
+ print("before .to(device)")
293
+ model = model.to(device)
294
+ print("after .to(device)")
295
+ model = model.bfloat16()
296
+ model.eval()
297
+
298
+ tokenizer.add_special_tokens({"mask_token": "<mask>"})
299
+ quantizer.additional_special_tokens.add("<mask>")
300
+ added_special_tokens_list = ["<layout>", "<position>", "<wholecaption>"]
301
+ tokenizer.add_special_tokens({"additional_special_tokens": added_special_tokens_list}, replace_additional_special_tokens=False)
302
+ for token in added_special_tokens_list:
303
+ quantizer.additional_special_tokens.add(token)
304
+
305
+ return model, quantizer, tokenizer
306
+
307
+ def construction_layout():
308
+ params_dict = {
309
+ # Modify these as needed
310
+ "input_model": "WYBar/LLM_For_Layout_Planning",
311
+ "resume": "WYBar/LLM_For_Layout_Planning",
312
+
313
+ "seed": 0,
314
+ "mask_values": False,
315
+ "quantizer_version": 'v4',
316
+ "mask_type": 'cm3',
317
+ "decimal_quantize_types": [],
318
+ "num_mask_tokens": 0,
319
+ "width": 512,
320
+ "height": 512,
321
+ "device": 0,
322
+ }
323
+ device = "cuda"
324
+ # Init model
325
+ model, quantizer, tokenizer = buildmodel(**params_dict)
326
+
327
+ print('resize token embeddings to match the tokenizer', 129423)
328
+ model.lm.resize_token_embeddings(129423)
329
+ model.input_embeddings = model.lm.get_input_embeddings()
330
+ print('after token embeddings to match the tokenizer', 129423)
331
+ return model, quantizer, tokenizer, params_dict["width"], params_dict["height"], device
332
+
333
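+ # Prompt the layout LM with a partial-JSON prefix, stop at token id 128000, then rebuild
+ # the JSON and scale the predicted layout values by the canvas width and height.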
+ @torch.no_grad()
334
+ @spaces.GPU(enable_queue=True, duration=60)
335
+ def evaluate_v1(inputs, model, quantizer, tokenizer, width, height, device, do_sample=False, temperature=1.0, top_p=1.0, top_k=50):
336
+ json_example = inputs
337
+ input_intension = '{"wholecaption":"' + json_example["wholecaption"] + '","layout":[{"layer":'
338
+ inputs = tokenizer(
339
+ input_intension, return_tensors="pt"
340
+ ).to(device)
341
+
342
+ stopping_criteria = StoppingCriteriaList()
343
+ stopping_criteria.append(StopAtSpecificTokenCriteria(token_id_list=[128000]))
344
+
345
+ outputs = model.lm.generate(**inputs, use_cache=True, max_length=8000, stopping_criteria=stopping_criteria, do_sample=do_sample, temperature=temperature, top_p=top_p, top_k=top_k)
346
+ inputs_length = inputs['input_ids'].shape[1]
347
+ outputs = outputs[:, inputs_length:]
348
+
349
+ outputs_word = tokenizer.batch_decode(outputs)[0]
350
+ split_word = outputs_word.split('}]}')[0]+"}]}"
351
+ split_word = '{"wholecaption":"' + json_example["wholecaption"].replace('\n', '\\n').replace('"', '\\"') + '","layout":[{"layer":' + split_word
352
+ map_dict = quantizer.construct_map_dict()
353
+
354
+ for key, value in map_dict.items():
355
+ split_word = split_word.replace(key, value)
356
+ try:
357
+ pred_json_example = json.loads(split_word)
358
+ for layer in pred_json_example["layout"]:
359
+ layer['x'] = round(int(width)*layer['x'])
360
+ layer['y'] = round(int(height)*layer['y'])
361
+ layer['width'] = round(int(width)*layer['width'])
362
+ layer['height'] = round(int(height)*layer['height'])
363
+ except Exception as e:
364
+ print(e)
365
+ pred_json_example = None
366
+ return pred_json_example
367
+
368
+ def inference(generate_method, intention, model, quantizer, tokenizer, width, height, device, do_sample=True, temperature=1.0, top_p=1.0, top_k=50):
369
+ def FormulateInput(intension: str):
370
+ resdict = {}
371
+ resdict["wholecaption"] = intension
372
+ resdict["layout"] = []
373
+ return resdict
374
+
375
+ rawdata = FormulateInput(intention)
376
+
377
+ if generate_method == 'v1':
378
+ max_try_time = 5
379
+ preddata = None
380
+ while preddata is None and max_try_time > 0:
381
+ preddata = evaluate_v1(rawdata, model, quantizer, tokenizer, width, height, device, do_sample=do_sample, temperature=temperature, top_p=top_p, top_k=top_k)
382
+ max_try_time -= 1
383
+ else:
384
+ print("Please input a valid generate method")
385
+ preddata = None
386
+
387
+ return preddata
388
+
389
+ # @spaces.GPU(enable_queue=True, duration=60)
390
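+ # Load the fused transformer and transparency VAE from WYBar/ART_test_weights and plug
+ # them into a FLUX.1-dev pipeline; HF_TOKEN is read from the environment.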
+ def construction():
391
+ from custom_model_mmdit import CustomFluxTransformer2DModel
392
+ from custom_model_transp_vae import AutoencoderKLTransformerTraining as CustomVAE
393
+ from custom_pipeline import CustomFluxPipelineCfg
394
+
395
+ transformer = CustomFluxTransformer2DModel.from_pretrained(
396
+ "WYBar/ART_test_weights",
397
+ subfolder="fused_transformer",
398
+ torch_dtype=torch.bfloat16,
399
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
400
+ )
401
+
402
+ transp_vae = CustomVAE.from_pretrained(
403
+ "WYBar/ART_test_weights",
404
+ subfolder="custom_vae",
405
+ torch_dtype=torch.float32,
406
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
407
+ )
408
+
409
+ token = os.environ.get("HF_TOKEN")
410
+ pipeline = CustomFluxPipelineCfg.from_pretrained(
411
+ "black-forest-labs/FLUX.1-dev",
412
+ transformer=transformer,
413
+ torch_dtype=torch.bfloat16,
414
+ token=token,
415
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
416
+ ).to("cuda")
417
+ pipeline.enable_model_cpu_offload(gpu_id=0) # Save GPU memory
418
+
419
+ return pipeline, transp_vae
420
+
421
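+ # Run the layer-wise Flux pipeline for the given boxes and alpha-composite the RGBA
+ # layers for display in the Gradio gallery.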
+ @spaces.GPU(enable_queue=True, duration=60)
422
+ def test_one_sample(validation_box, validation_prompt, true_gs, inference_steps, pipeline, generator, transp_vae):
423
+ print(validation_box)
424
+ output, rgba_output, _, _ = pipeline(
425
+ prompt=validation_prompt,
426
+ validation_box=validation_box,
427
+ generator=generator,
428
+ height=512,
429
+ width=512,
430
+ num_layers=len(validation_box),
431
+ guidance_scale=4.0,
432
+ num_inference_steps=inference_steps,
433
+ transparent_decoder=transp_vae,
434
+ true_gs=true_gs
435
+ )
436
+ images = output.images # list of PIL, len=layers
437
+ rgba_images = [Image.fromarray(arr, 'RGBA') for arr in rgba_output]
438
+
439
+ output_gradio = []
440
+ merged_pil = images[1].convert('RGBA')
441
+ for frame_idx, frame_pil in enumerate(rgba_images):
442
+ if frame_idx < 2:
443
+ frame_pil = images[frame_idx].convert('RGBA') # merged and background
444
+ else:
445
+ merged_pil = Image.alpha_composite(merged_pil, frame_pil)
446
+ output_gradio.append(frame_pil)
447
+
448
+ return output_gradio
449
+
450
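+ # Parse and validate the box string, generate the layers, write them to ./image.svg,
+ # and return both the gallery images and the SVG path.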
+ def svg_test_one_sample(validation_prompt, validation_box_str, seed, true_gs, inference_steps, pipeline, transp_vae):
451
+ generator = torch.Generator().manual_seed(seed)
452
+ try:
453
+ validation_box = ast.literal_eval(validation_box_str)
454
+ except Exception as e:
455
+ return [f"Error parsing validation_box: {e}"]
456
+ if not isinstance(validation_box, list) or not all(isinstance(t, tuple) and len(t) == 4 for t in validation_box):
457
+ return ["validation_box must be a list of tuples, each of length 4."]
458
+
459
+ validation_box = adjust_validation_box(validation_box)
460
+
461
+ result_images = test_one_sample(validation_box, validation_prompt, true_gs, inference_steps, pipeline, generator, transp_vae)
462
+
463
+ svg_img = pngs_to_svg(result_images[1:])
464
+
465
+ svg_file_path = './image.svg'
466
+ os.makedirs(os.path.dirname(svg_file_path), exist_ok=True)
467
+ with open(svg_file_path, 'w', encoding='utf-8') as f:
468
+ f.write(svg_img)
469
+
470
+ return result_images, svg_file_path
471
+
472
+ def main():
473
+ model, quantizer, tokenizer, width, height, device = construction_layout()
474
+
475
+ inference_partial = partial(
476
+ inference,
477
+ model=model,
478
+ quantizer=quantizer,
479
+ tokenizer=tokenizer,
480
+ width=width,
481
+ height=height,
482
+ device=device
483
+ )
484
+
485
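+ # Clean the intention text, query the layout model, convert the predicted center-format
+ # boxes to (left, top, right, bottom), and drop boxes whose IoU with an earlier box exceeds 0.65.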
+ def process_preddate(intention, temperature, top_p, generate_method='v1'):
486
+ intention = intention.replace('\n', '').replace('\r', '').replace('\\', '')
487
+ intention = ensure_space_after_period(intention)
488
+ if temperature == 0.0:
489
+ # print("looking for greedy decoding strategies, set `do_sample=False`.")
490
+ preddata = inference_partial(generate_method, intention, do_sample=False)
491
+ else:
492
+ preddata = inference_partial(generate_method, intention, temperature=temperature, top_p=top_p)
493
+ # wholecaption = preddata["wholecaption"]
494
+ layouts = preddata["layout"]
495
+ list_box = []
496
+ for i, layout in enumerate(layouts):
497
+ x, y = layout["x"], layout["y"]
498
+ width, height = layout["width"], layout["height"]
499
+ if i == 0:
500
+ list_box.append((0, 0, width, height))
501
+ list_box.append((0, 0, width, height))
502
+ else:
503
+ left = x - width // 2
504
+ top = y - height // 2
505
+ right = x + width // 2
506
+ bottom = y + height // 2
507
+ list_box.append((left, top, right, bottom))
508
+
509
+ # print(list_box)
510
+ filtered_boxes = list_box[:2]
511
+ for i in range(2, len(list_box)):
512
+ keep = True
513
+ for j in range(1, len(filtered_boxes)):
514
+ iou = calculate_iou(list_box[i], filtered_boxes[j])
515
+ if iou > 0.65:
516
+ print(list_box[i], filtered_boxes[j])
517
+ keep = False
518
+ break
519
+ if keep:
520
+ filtered_boxes.append(list_box[i])
521
+
522
+ return str(filtered_boxes), intention, str(filtered_boxes)
523
+
524
+ # def process_preddate(intention, generate_method='v1'):
525
+ # list_box = [(0, 0, 512, 512), (0, 0, 512, 512), (136, 184, 512, 512), (144, 0, 512, 512), (0, 0, 328, 136), (160, 112, 512, 360), (168, 112, 512, 360), (40, 232, 112, 296), (32, 88, 248, 176), (48, 424, 144, 448), (48, 464, 144, 488), (240, 464, 352, 488), (384, 464, 488, 488), (48, 480, 144, 504), (240, 480, 360, 504), (456, 0, 512, 56), (0, 0, 56, 40), (440, 0, 512, 40), (0, 24, 48, 88), (48, 168, 168, 240)]
526
+ # wholecaption = "Design an engaging and vibrant recruitment advertisement for our company. The image should feature three animated characters in a modern cityscape, depicting a dynamic and collaborative work environment. Incorporate a light bulb graphic with a question mark, symbolizing innovation, creativity, and problem-solving. Use bold text to announce \"WE ARE RECRUITING\" and provide the company's social media handle \"@reallygreatsite\" and a contact phone number \"+123-456-7890\" for interested individuals. The overall design should be playful and youthful, attracting potential recruits who are innovative and eager to contribute to a lively team."
527
+ # json_file = "/home/wyb/openseg_blob/v-yanbin/GradioDemo/LLM-For-Layout-Planning/inference_test.json"
528
+ # return wholecaption, str(list_box), json_file
529
+
530
+ pipeline, transp_vae = construction()
531
+
532
+ gradio_test_one_sample_partial = partial(
533
+ svg_test_one_sample,
534
+ pipeline=pipeline,
535
+ transp_vae=transp_vae,
536
+ )
537
+
538
+ def process_svg(text_input, tuple_input, seed, true_gs, inference_steps):
539
+ result_images = []
540
+ result_images, svg_file_path = gradio_test_one_sample_partial(text_input, tuple_input, seed, true_gs, inference_steps)
541
+
542
+ url, unique_filename = upload_to_github(file_path=svg_file_path)
543
+ unique_filename = f'{unique_filename}'
544
+
545
+ if url is not None:
546
+ print(f"File uploaded to: {url}")
547
+ svg_editor = f"""
548
+ <iframe src="https://svgedit.netlify.app/editor/index.html?\
549
+ storagePrompt=false&url={url}" \
550
+ width="100%" height="800px"></iframe>
551
+ """
552
+ else:
553
+ print('upload_to_github FAILED!')
554
+ svg_editor = f"""
555
+ <iframe src="https://svgedit.netlify.app/editor/index.html" \
556
+ width="100%" height="800px"></iframe>
557
+ """
558
+
559
+ return result_images, svg_file_path, svg_editor
560
+
561
+ def one_click_generate(intention_input, temperature, top_p, seed, true_gs, inference_steps):
562
+ # First call process_preddate
563
+ list_box_output, intention_input, list_box_output = process_preddate(intention_input, temperature, top_p)
564
+
565
+ # Then feed the output of process_preddate into process_svg
566
+ result_images, svg_file, svg_editor = process_svg(intention_input, list_box_output, seed, true_gs, inference_steps)
567
+
568
+ # Return the outputs of both functions
569
+ return list_box_output, result_images, svg_file, svg_editor, intention_input, list_box_output
570
+
571
+ def clear_inputs1():
572
+ return "", ""
573
+
574
+ def clear_inputs2():
575
+ return "", ""
576
+
577
+ def transfer_inputs(intention, list_box):
578
+ return intention, list_box
579
+
580
+ theme = gr.themes.Soft(
581
+ radius_size="lg",
582
+ ).set(
583
+ block_background_fill='*primary_50',
584
+ block_border_color='*primary_200',
585
+ block_border_width='1px',
586
+ block_border_width_dark='100px',
587
+ block_info_text_color='*primary_950',
588
+ block_label_border_color='*primary_200',
589
+ block_radius='*radius_lg'
590
+ )
591
+
592
+ with gr.Blocks(theme=theme) as demo:
593
+ gr.HTML("<h1 style='text-align: center;'>ART: Anonymous Region Transformer for Variable Multi-Layer Transparent Image Generation</h1>")
594
+ gr.HTML("<h2>Anonymous Region Layout Planner</h2>")
595
+
596
+ with gr.Row():
597
+ with gr.Column():
598
+ intention_input = gr.Textbox(lines=15, placeholder="Enter intention", label="Prompt")
599
+ with gr.Row():
600
+ temperature_input=gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Temperature", value=0.0)
601
+ top_p_input=gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Top P", value=0.0)
602
+ with gr.Row():
603
+ clear_btn1 = gr.Button("Clear")
604
+ model_btn1 = gr.Button("Commit", variant='primary')
605
+ transfer_btn1 = gr.Button("Export to below")
606
+
607
+ one_click_btn = gr.Button("One Click Generate ALL", variant='primary')
608
+
609
+ with gr.Column():
610
+ list_box_output = gr.Textbox(lines=10, placeholder="Validation Box", label="Validation Box")
611
+
612
+ examples = gr.Examples(
613
+ examples=[
614
+ ['The image is a graphic design with a celebratory theme. At the top, there is a banner with the text \"Happy Anniversary\" in a bold, sans-serif font. Below this banner, there is a circular frame containing a photograph of a couple. The man has short, dark hair and is wearing a light-colored sweater, while the woman has long blonde hair and is also wearing a light-colored sweater. They are both smiling and appear to be embracing each other.Surrounding the circular frame are decorative elements such as pink flowers and green leaves, which add a festive touch to the design. Below the circular frame, there is a text that reads "Isabel & Morgan" in a cursive, elegant font, suggesting that the couple\'s names are Isabel and Morgan.At the bottom of the image, there is a banner with a message that says "Happy Anniversary! Cheers to another year of love, laughter, and cherished memories together.\" This text is in a smaller, sans-serif font and is placed against a solid background, providing a clear message of celebration and well-wishes for the couple.The overall style of the image is warm and celebratory, with a color scheme that includes shades of pink, green, and white, which contribute to a joyful and romantic atmosphere.'],
615
+ ['The image is a digital illustration with a light blue background. At the top, there is a logo consisting of a snake wrapped around a staff, which is a common symbol in healthcare. Below the logo, the text "International Nurses Day" is prominently displayed in white, with the date "12 May 20xx" in smaller font size.The central part of the image features two stylized characters. On the left, there is a female character with dark hair, wearing a white nurse\'s uniform with a cap. She is holding a clipboard and appears to be speaking or gesturing, as indicated by a speech bubble with the word "OK" in it. On the right, there is a male character with light brown hair, wearing a light blue shirt with a white collar and a white apron. He is holding a stethoscope to his ear, suggesting he is a doctor or a healthcare professional.The characters are depicted in a friendly and approachable manner, with smiles on their faces. Around them, there are small blue plus signs, which are often associated with healthcare and medical services. The overall style of the image is clean, modern, and appears to be designed to celebrate International Nurses Day.'],
616
+ ['The image features a graphic design with a festive theme. At the top, there is a decorative border with a wavy pattern. Below this border, the text "WINTER SEASON SPECIAL COOKIES" is prominently displayed in a bold, sans-serif font. The text is black with a slight shadow effect, giving it a three-dimensional appearance.In the center of the image, there are three illustrated gingerbread cookies. Each cookie has a smiling face with eyes, a nose, and a mouth, and they are colored in a warm, brown hue. The cookies are arranged in a staggered formation, with the middle cookie slightly higher than the others, creating a sense of depth.At the bottom of the image, there is a call to action that reads "ORDER.NOW" in a large, bold, sans-serif font. The text is colored in a darker shade of brown, contrasting with the lighter background. The overall style of the image suggests it is an advertisement or promotional graphic for a winter-themed cookie special.']
617
+ ],
618
+ inputs=[intention_input]
619
+ )
620
+
621
+ gr.HTML("<h2>Anonymous Region Transformer</h2>")
622
+ with gr.Row():
623
+ with gr.Column():
624
+ text_input = gr.Textbox(lines=10, placeholder="Enter prompt text", label="Prompt")
625
+ tuple_input = gr.Textbox(lines=5, placeholder="Enter list of tuples, e.g., [(1, 2, 3, 4), (5, 6, 7, 8)]", label="Validation Box")
626
+ with gr.Row():
627
+ true_gs_input=gr.Slider(minimum=3.0, maximum=5.0, step=0.1, label="true_gs", value=3.5)
628
+ inference_steps_input=gr.Slider(minimum=5, maximum=50, step=1, label="inference_steps", value=28)
629
+ with gr.Row():
630
+ seed_input = gr.Number(label="Seed", value=42)
631
+ with gr.Row():
632
+ transfer_btn2 = gr.Button("Import from above")
633
+ with gr.Row():
634
+ clear_btn2 = gr.Button("Clear")
635
+ model_btn2 = gr.Button("Commit", variant='primary')
636
+
637
+ with gr.Column():
638
+ result_images = gr.Gallery(label="Result Images", columns=5, height='auto')
639
+
640
+ gr.HTML("<h1>SVG Image</h1>")
641
+ svg_file = gr.File(label="Download SVG Image")
642
+ svg_editor = gr.HTML(label="Editable SVG Editor")
643
+
644
+ model_btn1.click(
645
+ fn=process_preddate,
646
+ inputs=[intention_input, temperature_input, top_p_input],
647
+ outputs=[list_box_output, text_input, tuple_input],
648
+ api_name="process_preddate"
649
+ )
650
+ clear_btn1.click(
651
+ fn=clear_inputs1,
652
+ inputs=[],
653
+ outputs=[intention_input, list_box_output]
654
+ )
655
+ model_btn2.click(
656
+ fn=process_svg,
657
+ inputs=[text_input, tuple_input, seed_input, true_gs_input, inference_steps_input],
658
+ outputs=[result_images, svg_file, svg_editor],
659
+ api_name="process_svg"
660
+ )
661
+ clear_btn2.click(
662
+ fn=clear_inputs2,
663
+ inputs=[],
664
+ outputs=[text_input, tuple_input]
665
+ )
666
+ transfer_btn1.click(
667
+ fn=transfer_inputs,
668
+ inputs=[intention_input, list_box_output],
669
+ outputs=[text_input, tuple_input]
670
+ )
671
+ transfer_btn2.click(
672
+ fn=transfer_inputs,
673
+ inputs=[intention_input, list_box_output],
674
+ outputs=[text_input, tuple_input]
675
+ )
676
+ one_click_btn.click(
677
+ fn=one_click_generate,
678
+ inputs=[intention_input, temperature_input, top_p_input, seed_input, true_gs_input, inference_steps_input],
679
+ outputs=[list_box_output, result_images, svg_file, svg_editor, text_input, tuple_input]
680
+ )
681
+ demo.launch()
682
+
683
+ if __name__ == "__main__":
684
+ main()
app_test.py ADDED
@@ -0,0 +1,684 @@
1
+ import os
2
+ # import spaces
3
+
4
+ import ast
5
+ import numpy as np
6
+ from functools import partial
7
+
8
+ import torch
9
+ import torch.utils.checkpoint
10
+
11
+ from PIL import Image
12
+ import xml.etree.cElementTree as ET
13
+ from io import BytesIO
14
+ import base64
15
+ import json
16
+
17
+ import gradio as gr
18
+ from functools import partial
19
+ import requests
20
+ import base64
21
+ import os
22
+ import time
23
+ import re
24
+
25
+ from transformers import (
26
+ AutoTokenizer,
27
+ set_seed
28
+ )
29
+ from typing import List
30
+
31
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
32
+ from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList, \
33
+ STOPPING_CRITERIA_INPUTS_DOCSTRING, add_start_docstrings
34
+ class StopAtSpecificTokenCriteria(StoppingCriteria):
35
+ def __init__(self, token_id_list: List[int] = None):
36
+ self.token_id_list = token_id_list
37
+ @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
38
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
39
+ return input_ids[0][-1].detach().cpu().numpy() in self.token_id_list
40
+
41
+ def ensure_space_after_period(input_string):
42
+ # Normalize spacing so that each period is followed by exactly one space
43
+ output_string = re.sub(r'\.\s*', '. ', input_string)
44
+ return output_string
45
+
46
+ def generate_unique_filename():
47
+ # Generate a unique filename based on the current timestamp
48
+ timestamp = int(time.time() * 1000) # timestamp in milliseconds
49
+ # random_num = random.randint(1000, 9999) # random number
50
+ unique_filename = f"{timestamp}"
51
+ return unique_filename
52
+
53
+ def upload_to_github(file_path,
54
+ repo='WYBar/gradiodemo_svg',
55
+ branch='main',
56
+ token='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'):
57
+ if not os.path.isfile(file_path):
58
+ print(f"File not found: {file_path}")
59
+ return None, None
60
+ with open(file_path, 'rb') as file:
61
+ content = file.read()
62
+ encoded_content = base64.b64encode(content).decode('utf-8')
63
+ unique_filename = generate_unique_filename()
64
+ url = f"https://api.github.com/repos/{repo}/contents/{unique_filename}.svg"
65
+ headers = {
66
+ "Authorization": f"token {token}"
67
+ }
68
+ response = requests.get(url, headers=headers)
69
+
70
+ sha = None
71
+ if response.status_code == 200:
72
+ sha = response.json()['sha']
73
+ elif response.status_code == 404:
74
+ # File does not exist, so no SHA is needed
75
+ pass
76
+ else:
77
+ print(f"Failed to get file status: {response.status_code}")
78
+ # print(response.text)
79
+ return None, None
80
+
81
+ headers = {
82
+ "Authorization": f"token {token}",
83
+ "Content-Type": "application/json"
84
+ }
85
+ data = {
86
+ "message": "upload svg file",
87
+ "content": encoded_content,
88
+ "branch": branch
89
+ }
90
+
91
+ if sha:
92
+ # File exists: update it
93
+ # print('sha exists, update the old one')
94
+ data["sha"] = sha
95
+ response = requests.put(url, headers=headers, json=data)
96
+ else:
97
+ # File does not exist: create a new one
98
+ print("sha does not exist, need to create a new one")
99
+ response = requests.put(url, headers=headers, json=data)
100
+
101
+ # print(response.status_code)
102
+ # print(response.text)
103
+ if response.status_code in [200, 201]:
104
+ # print(response.json()['content']['download_url'])
105
+ return response.json()['content']['download_url'], unique_filename
106
+ else:
107
+ print(f"Upload failed with status {response.status_code}")
108
+ return None, None
109
+
110
+ def calculate_iou(box1, box2):
111
+ # Compute the intersection of the two boxes
112
+ x1 = max(box1[0], box2[0])
113
+ y1 = max(box1[1], box2[1])
114
+ x2 = min(box1[2], box2[2])
115
+ y2 = min(box1[3], box2[3])
116
+
117
+ intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
118
+
119
+ # Compute the union of the two boxes
120
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
121
+ box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
122
+
123
+ union_area = box1_area + box2_area - intersection_area
124
+
125
+ # Compute the IoU
126
+ iou = intersection_area / union_area
127
+ return iou
128
+
129
+ def adjust_coordinates(box):
130
+ size = 32
131
+ (x1, y1, x2, y2) = box
132
+ if x1 % size != 0:
133
+ x1 = (x1 // size) * size
134
+ if x2 % size != 0:
135
+ x2 = (x2 // size + 1) * size
136
+
137
+ if y1 % size != 0:
138
+ y1 = (y1 // size) * size
139
+ if y2 % size != 0:
140
+ y2 = (y2 // size + 1) * size
141
+ return (x1, y1, x2, y2)
142
+
143
+ def adjust_validation_box(validation_box):
144
+ return [adjust_coordinates(box) for box in validation_box]
145
+
146
+ def get_list_layer_box(list_png_images):
147
+ list_layer_box = []
148
+ for img in list_png_images:
149
+ img_np = np.array(img)
150
+ alpha_channel = img_np[:, :, -1]
151
+
152
+ # Step 1: Find the non-zero indices
153
+ rows, cols = np.nonzero(alpha_channel)
154
+
155
+ if (len(rows) == 0) or (len(cols) == 0):
156
+ # If there are no non-zero indices, we can skip this layer
157
+ list_layer_box.append((0, 0, 0, 0))
158
+ continue
159
+
160
+ # Step 2: Get the minimum and maximum indices for rows and columns
161
+ min_row, max_row = rows.min().item(), rows.max().item()
162
+ min_col, max_col = cols.min().item(), cols.max().item()
163
+
164
+ # Step 3: Quantize the minimum values down to the nearest multiple of 8
165
+ quantized_min_row = (min_row // 8) * 8
166
+ quantized_min_col = (min_col // 8) * 8
167
+
168
+ # Step 4: Quantize the maximum values up to the nearest multiple of 8 outside of the max
169
+ quantized_max_row = ((max_row // 8) + 1) * 8
170
+ quantized_max_col = ((max_col // 8) + 1) * 8
171
+ list_layer_box.append(
172
+ (quantized_min_col, quantized_min_row, quantized_max_col, quantized_max_row)
173
+ )
174
+ return list_layer_box
175
+
176
+ def pngs_to_svg(list_png_images):
177
+ list_layer_box = get_list_layer_box(list_png_images)
178
+ assert(len(list_png_images) == len(list_layer_box))
179
+ width, height = list_png_images[0].width, list_png_images[0].height
180
+ img_svg = ET.Element(
181
+ 'svg',
182
+ {
183
+ "width": str(width),
184
+ "height": str(height),
185
+ "xmlns": "http://www.w3.org/2000/svg",
186
+ "xmlns:svg": "http://www.w3.org/2000/svg",
187
+ "xmlns:xlink":"http://www.w3.org/1999/xlink"
188
+ }
189
+ )
190
+ for img, box in zip(list_png_images, list_layer_box):
191
+ x, y, w, h = box[0], box[1], box[2]-box[0], box[3]-box[1]
192
+ if (w == 0 or h == 0):
193
+ continue
194
+ img = img.crop((x, y, x+w, y+h))
195
+ buffer = BytesIO()
196
+ img.save(buffer, format='PNG')
197
+ img_str = base64.b64encode(buffer.getvalue())
198
+ ET.SubElement(
199
+ img_svg,
200
+ "image",
201
+ {
202
+ "x": str(x),
203
+ "y": str(y),
204
+ "width": str(w),
205
+ "height": str(h),
206
+ "xlink:href": "data:image/png;base64,"+img_str.decode('utf-8')
207
+ }
208
+ )
209
+ return ET.tostring(img_svg, encoding='utf-8').decode('utf-8')
210
+
211
+ def calculate_iou(box1, box2):
212
+ # Compute the intersection of the two boxes
213
+ x1 = max(box1[0], box2[0])
214
+ y1 = max(box1[1], box2[1])
215
+ x2 = min(box1[2], box2[2])
216
+ y2 = min(box1[3], box2[3])
217
+
218
+ intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
219
+
220
+ # Compute the union of the two boxes
221
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
222
+ box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
223
+
224
+ union_area = box1_area + box2_area - intersection_area
225
+
226
+ # Compute the IoU
227
+ iou = intersection_area / union_area
228
+ return iou
229
+
230
+ # @spaces.GPU(enable_queue=True, duration=60)
231
+ def buildmodel(**kwargs):
232
+ from modeling_crello import CrelloModel, CrelloModelConfig
233
+ from quantizer import get_quantizer
234
+ # seed / input model / resume
235
+ resume = kwargs.get('resume', None)
236
+ seed = kwargs.get('seed', None)
237
+ input_model = kwargs.get('input_model', None)
238
+ quantizer_version = kwargs.get('quantizer_version', 'v4')
239
+ device = "cuda"
240
+
241
+ set_seed(seed)
242
+ # old_tokenizer = AutoTokenizer.from_pretrained(input_model, trust_remote_code=True)
243
+ old_tokenizer = AutoTokenizer.from_pretrained(
244
+ "WYBar/LLM_For_Layout_Planning", # repository path
245
+ subfolder="Meta-Llama-3-8B", # subfolder holding the base model
246
+ trust_remote_code=True,
247
+ cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
248
+ )
249
+ old_vocab_size = len(old_tokenizer)
250
+ # tokenizer = AutoTokenizer.from_pretrained(resume, trust_remote_code=True)
251
+ tokenizer = AutoTokenizer.from_pretrained(
252
+ "WYBar/LLM_For_Layout_Planning",
253
+ subfolder="checkpoint-26000", # subfolder containing the checkpoint
254
+ trust_remote_code=True,
255
+ cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
256
+ )
257
+
258
+ quantizer = get_quantizer(
259
+ quantizer_version,
260
+ update_vocab = False,
261
+ decimal_quantize_types = kwargs.get('decimal_quantize_types'),
262
+ mask_values = kwargs['mask_values'],
263
+ width = kwargs['width'],
264
+ height = kwargs['height'],
265
+ simplify_json = False,
266
+ num_mask_tokens = 0,
267
+ mask_type = kwargs.get('mask_type'),
268
+ )
269
+ quantizer.setup_tokenizer(tokenizer)
270
+
271
+ model_args = CrelloModelConfig(
272
+ old_vocab_size = old_vocab_size,
273
+ vocab_size=len(tokenizer),
274
+ pad_token_id=tokenizer.pad_token_id,
275
+ ignore_ids=tokenizer.convert_tokens_to_ids(quantizer.ignore_tokens),
276
+ )
277
+ model_args.freeze_lm = True
278
+ model_args.opt_version = "WYBar/LLM_For_Layout_Planning"
279
+ model_args.use_lora = False
280
+ model_args.load_in_4bit = kwargs.get('load_in_4bit', False)
281
+ # model = CrelloModel.from_pretrained(
282
+ # resume,
283
+ # config=model_args
284
+ # ).to(device)
285
+ # model = CrelloModel.from_pretrained(
286
+ # "WYBar/LLM_For_Layout_Planning",
287
+ # subfolder="checkpoint-26000", # checkpoint directory to load
288
+ # config=model_args,
289
+ # # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
290
+ # )
291
+ model = CrelloModel(config=model_args)
292
+ print("before .to(device)")
293
+ model = model.to(device)
294
+ print("after .to(device)")
295
+ model = model.bfloat16()
296
+ model.eval()
297
+
298
+ tokenizer.add_special_tokens({"mask_token": "<mask>"})
299
+ quantizer.additional_special_tokens.add("<mask>")
300
+ added_special_tokens_list = ["<layout>", "<position>", "<wholecaption>"]
301
+ tokenizer.add_special_tokens({"additional_special_tokens": added_special_tokens_list}, replace_additional_special_tokens=False)
302
+ for token in added_special_tokens_list:
303
+ quantizer.additional_special_tokens.add(token)
304
+
305
+ return model, quantizer, tokenizer
306
+
307
+ def construction_layout():
308
+ params_dict = {
309
+ # Modify these as needed
310
+ "input_model": "WYBar/LLM_For_Layout_Planning",
311
+ "resume": "WYBar/LLM_For_Layout_Planning",
312
+
313
+ "seed": 0,
314
+ "mask_values": False,
315
+ "quantizer_version": 'v4',
316
+ "mask_type": 'cm3',
317
+ "decimal_quantize_types": [],
318
+ "num_mask_tokens": 0,
319
+ "width": 512,
320
+ "height": 512,
321
+ "device": 0,
322
+ }
323
+ device = "cuda"
324
+ # Init model
325
+ model, quantizer, tokenizer = buildmodel(**params_dict)
326
+
327
+ print('resize token embeddings to match the tokenizer', 129423)
328
+ model.lm.resize_token_embeddings(129423)
329
+ model.input_embeddings = model.lm.get_input_embeddings()
330
+ print('after token embeddings to match the tokenizer', 129423)
331
+ return model, quantizer, tokenizer, params_dict["width"], params_dict["height"], device
332
+
333
+ @torch.no_grad()
334
+ # @spaces.GPU(enable_queue=True, duration=60)
335
+ def evaluate_v1(inputs, model, quantizer, tokenizer, width, height, device, do_sample=False, temperature=1.0, top_p=1.0, top_k=50):
336
+ json_example = inputs
337
+ input_intension = '{"wholecaption":"' + json_example["wholecaption"] + '","layout":[{"layer":'
338
+ inputs = tokenizer(
339
+ input_intension, return_tensors="pt"
340
+ ).to(device)
341
+
342
+ stopping_criteria = StoppingCriteriaList()
343
+ stopping_criteria.append(StopAtSpecificTokenCriteria(token_id_list=[128000]))
344
+
345
+ outputs = model.lm.generate(**inputs, use_cache=True, max_length=8000, stopping_criteria=stopping_criteria, do_sample=do_sample, temperature=temperature, top_p=top_p, top_k=top_k)
346
+ inputs_length = inputs['input_ids'].shape[1]
347
+ outputs = outputs[:, inputs_length:]
348
+
349
+ outputs_word = tokenizer.batch_decode(outputs)[0]
350
+ split_word = outputs_word.split('}]}')[0]+"}]}"
351
+ split_word = '{"wholecaption":"' + json_example["wholecaption"].replace('\n', '\\n').replace('"', '\\"') + '","layout":[{"layer":' + split_word
352
+ map_dict = quantizer.construct_map_dict()
353
+
354
+ for key, value in map_dict.items():
355
+ split_word = split_word.replace(key, value)
356
+ try:
357
+ pred_json_example = json.loads(split_word)
358
+ for layer in pred_json_example["layout"]:
359
+ layer['x'] = round(int(width)*layer['x'])
360
+ layer['y'] = round(int(height)*layer['y'])
361
+ layer['width'] = round(int(width)*layer['width'])
362
+ layer['height'] = round(int(height)*layer['height'])
363
+ except Exception as e:
364
+ print(e)
365
+ pred_json_example = None
366
+ return pred_json_example
367
+
368
+ def inference(generate_method, intention, model, quantizer, tokenizer, width, height, device, do_sample=True, temperature=1.0, top_p=1.0, top_k=50):
369
+ def FormulateInput(intension: str):
370
+ resdict = {}
371
+ resdict["wholecaption"] = intension
372
+ resdict["layout"] = []
373
+ return resdict
374
+
375
+ rawdata = FormulateInput(intention)
376
+
377
+ if generate_method == 'v1':
378
+ max_try_time = 5
379
+ preddata = None
380
+ while preddata is None and max_try_time > 0:
381
+ preddata = evaluate_v1(rawdata, model, quantizer, tokenizer, width, height, device, do_sample=do_sample, temperature=temperature, top_p=top_p, top_k=top_k)
382
+ max_try_time -= 1
383
+ else:
384
+ print("Please input a valid generate method")
385
+ preddata = None
386
+
387
+ return preddata
388
+
389
+ # @spaces.GPU(enable_queue=True, duration=60)
390
+ def construction():
391
+ from custom_model_mmdit import CustomFluxTransformer2DModel
392
+ from custom_model_transp_vae import AutoencoderKLTransformerTraining as CustomVAE
393
+ from custom_pipeline import CustomFluxPipelineCfg
394
+
395
+ transformer = CustomFluxTransformer2DModel.from_pretrained(
396
+ "WYBar/ART_test_weights",
397
+ subfolder="fused_transformer",
398
+ torch_dtype=torch.bfloat16,
399
+ cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
400
+ )
401
+
402
+ transp_vae = CustomVAE.from_pretrained(
403
+ "WYBar/ART_test_weights",
404
+ subfolder="custom_vae",
405
+ torch_dtype=torch.float32,
406
+ cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
407
+ )
408
+
409
+ token = os.environ.get("HF_TOKEN")
410
+ pipeline = CustomFluxPipelineCfg.from_pretrained(
411
+ "black-forest-labs/FLUX.1-dev",
412
+ transformer=transformer,
413
+ torch_dtype=torch.bfloat16,
414
+ token=token,
415
+ cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir"
416
+ ).to("cuda")
417
+ pipeline.enable_model_cpu_offload(gpu_id=0) # Save GPU memory
418
+
419
+ return pipeline, transp_vae
420
+
421
+ # @spaces.GPU(enable_queue=True, duration=60)
422
+ def test_one_sample(validation_box, validation_prompt, true_gs, inference_steps, pipeline, generator, transp_vae):
423
+ print(validation_box)
424
+ output, rgba_output, _, _ = pipeline(
425
+ prompt=validation_prompt,
426
+ validation_box=validation_box,
427
+ generator=generator,
428
+ height=512,
429
+ width=512,
430
+ num_layers=len(validation_box),
431
+ guidance_scale=4.0,
432
+ num_inference_steps=inference_steps,
433
+ transparent_decoder=transp_vae,
434
+ true_gs=true_gs
435
+ )
436
+ images = output.images # list of PIL, len=layers
437
+ rgba_images = [Image.fromarray(arr, 'RGBA') for arr in rgba_output]
438
+
439
+ output_gradio = []
440
+ merged_pil = images[1].convert('RGBA')
441
+ for frame_idx, frame_pil in enumerate(rgba_images):
442
+ if frame_idx < 2:
443
+ frame_pil = images[frame_idx].convert('RGBA') # merged and background
444
+ else:
445
+ merged_pil = Image.alpha_composite(merged_pil, frame_pil)
446
+ output_gradio.append(frame_pil)
447
+
448
+ return output_gradio
449
+
450
+ def svg_test_one_sample(validation_prompt, validation_box_str, seed, true_gs, inference_steps, pipeline, transp_vae):
451
+ generator = torch.Generator().manual_seed(seed)
452
+ try:
453
+ validation_box = ast.literal_eval(validation_box_str)
454
+ except Exception as e:
455
+ return [f"Error parsing validation_box: {e}"]
456
+ if not isinstance(validation_box, list) or not all(isinstance(t, tuple) and len(t) == 4 for t in validation_box):
457
+ return ["validation_box must be a list of tuples, each of length 4."]
458
+
459
+ validation_box = adjust_validation_box(validation_box)
460
+
461
+ result_images = test_one_sample(validation_box, validation_prompt, true_gs, inference_steps, pipeline, generator, transp_vae)
462
+
463
+ svg_img = pngs_to_svg(result_images[1:])
464
+
465
+ svg_file_path = './image.svg'
466
+ os.makedirs(os.path.dirname(svg_file_path), exist_ok=True)
467
+ with open(svg_file_path, 'w', encoding='utf-8') as f:
468
+ f.write(svg_img)
469
+
470
+ return result_images, svg_file_path
471
+
472
+ def main():
473
+ model, quantizer, tokenizer, width, height, device = construction_layout()
474
+
475
+ inference_partial = partial(
476
+ inference,
477
+ model=model,
478
+ quantizer=quantizer,
479
+ tokenizer=tokenizer,
480
+ width=width,
481
+ height=height,
482
+ device=device
483
+ )
484
+
485
+ def process_preddate(intention, temperature, top_p, generate_method='v1'):
486
+ intention = intention.replace('\n', '').replace('\r', '').replace('\\', '')
487
+ intention = ensure_space_after_period(intention)
488
+ if temperature == 0.0:
489
+ # print("looking for greedy decoding strategies, set `do_sample=False`.")
490
+ preddata = inference_partial(generate_method, intention, do_sample=False)
491
+ else:
492
+ preddata = inference_partial(generate_method, intention, temperature=temperature, top_p=top_p)
493
+ # wholecaption = preddata["wholecaption"]
494
+ layouts = preddata["layout"]
495
+ list_box = []
496
+ for i, layout in enumerate(layouts):
497
+ x, y = layout["x"], layout["y"]
498
+ width, height = layout["width"], layout["height"]
499
+ if i == 0:
500
+ list_box.append((0, 0, width, height))
501
+ list_box.append((0, 0, width, height))
502
+ else:
503
+ left = x - width // 2
504
+ top = y - height // 2
505
+ right = x + width // 2
506
+ bottom = y + height // 2
507
+ list_box.append((left, top, right, bottom))
508
+
509
+ # print(list_box)
510
+ filtered_boxes = list_box[:2]
511
+ for i in range(2, len(list_box)):
512
+ keep = True
513
+ for j in range(1, len(filtered_boxes)):
514
+ iou = calculate_iou(list_box[i], filtered_boxes[j])
515
+ if iou > 0.65:
516
+ print(list_box[i], filtered_boxes[j])
517
+ keep = False
518
+ break
519
+ if keep:
520
+ filtered_boxes.append(list_box[i])
521
+
522
+ return str(filtered_boxes), intention, str(filtered_boxes)
523
+
524
+ # def process_preddate(intention, generate_method='v1'):
525
+ # list_box = [(0, 0, 512, 512), (0, 0, 512, 512), (136, 184, 512, 512), (144, 0, 512, 512), (0, 0, 328, 136), (160, 112, 512, 360), (168, 112, 512, 360), (40, 232, 112, 296), (32, 88, 248, 176), (48, 424, 144, 448), (48, 464, 144, 488), (240, 464, 352, 488), (384, 464, 488, 488), (48, 480, 144, 504), (240, 480, 360, 504), (456, 0, 512, 56), (0, 0, 56, 40), (440, 0, 512, 40), (0, 24, 48, 88), (48, 168, 168, 240)]
526
+ # wholecaption = "Design an engaging and vibrant recruitment advertisement for our company. The image should feature three animated characters in a modern cityscape, depicting a dynamic and collaborative work environment. Incorporate a light bulb graphic with a question mark, symbolizing innovation, creativity, and problem-solving. Use bold text to announce \"WE ARE RECRUITING\" and provide the company's social media handle \"@reallygreatsite\" and a contact phone number \"+123-456-7890\" for interested individuals. The overall design should be playful and youthful, attracting potential recruits who are innovative and eager to contribute to a lively team."
527
+ # json_file = "/home/wyb/openseg_blob/v-yanbin/GradioDemo/LLM-For-Layout-Planning/inference_test.json"
528
+ # return wholecaption, str(list_box), json_file
529
+
530
+ pipeline, transp_vae = construction()
531
+
532
+ gradio_test_one_sample_partial = partial(
533
+ svg_test_one_sample,
534
+ pipeline=pipeline,
535
+ transp_vae=transp_vae,
536
+ )
537
+
538
+ def process_svg(text_input, tuple_input, seed, true_gs, inference_steps):
539
+ result_images = []
540
+ result_images, svg_file_path = gradio_test_one_sample_partial(text_input, tuple_input, seed, true_gs, inference_steps)
541
+
542
+ url, unique_filename = upload_to_github(file_path=svg_file_path)
543
+ unique_filename = f'{unique_filename}'
544
+
545
+ if url is not None:
546
+ print(f"File uploaded to: {url}")
547
+ svg_editor = f"""
548
+ <iframe src="https://svgedit.netlify.app/editor/index.html?\
549
+ storagePrompt=false&url={url}" \
550
+ width="100%" height="800px"></iframe>
551
+ """
552
+ else:
553
+ print('upload_to_github FAILED!')
554
+ svg_editor = f"""
555
+ <iframe src="https://svgedit.netlify.app/editor/index.html" \
556
+ width="100%" height="800px"></iframe>
557
+ """
558
+
559
+ return result_images, svg_file_path, svg_editor
560
+
561
+ def one_click_generate(intention_input, temperature, top_p, seed, true_gs, inference_steps):
562
+ # First call process_preddate
563
+ list_box_output, intention_input, list_box_output = process_preddate(intention_input, temperature, top_p)
564
+
565
+ # Then feed the output of process_preddate into process_svg
566
+ result_images, svg_file, svg_editor = process_svg(intention_input, list_box_output, seed, true_gs, inference_steps)
567
+
568
+ # Return the outputs of both functions
569
+ return list_box_output, result_images, svg_file, svg_editor, intention_input, list_box_output
570
+
571
+ def clear_inputs1():
572
+ return "", ""
573
+
574
+ def clear_inputs2():
575
+ return "", ""
576
+
577
+ def transfer_inputs(intention, list_box):
578
+ return intention, list_box
579
+
580
+ theme = gr.themes.Soft(
581
+ radius_size="lg",
582
+ ).set(
583
+ block_background_fill='*primary_50',
584
+ block_border_color='*primary_200',
585
+ block_border_width='1px',
586
+ block_border_width_dark='100px',
587
+ block_info_text_color='*primary_950',
588
+ block_label_border_color='*primary_200',
589
+ block_radius='*radius_lg'
590
+ )
591
+
592
+ with gr.Blocks(theme=theme) as demo:
593
+ gr.HTML("<h1 style='text-align: center;'>ART: Anonymous Region Transformer for Variable Multi-Layer Transparent Image Generation</h1>")
594
+ gr.HTML("<h2>Anonymous Region Layout Planner</h2>")
595
+
596
+ with gr.Row():
597
+ with gr.Column():
598
+ intention_input = gr.Textbox(lines=15, placeholder="Enter intention", label="Prompt")
599
+ with gr.Row():
600
+ temperature_input=gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Temperature", value=0.0)
601
+ top_p_input=gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Top P", value=0.0)
602
+ with gr.Row():
603
+ clear_btn1 = gr.Button("Clear")
604
+ model_btn1 = gr.Button("Commit", variant='primary')
605
+ transfer_btn1 = gr.Button("Export to below")
606
+
607
+ one_click_btn = gr.Button("One Click Generate ALL", variant='primary')
608
+
609
+ with gr.Column():
610
+ list_box_output = gr.Textbox(lines=10, placeholder="Validation Box", label="Validation Box")
611
+
612
+ examples = gr.Examples(
613
+ examples=[
614
+ ['The image is a graphic design with a celebratory theme. At the top, there is a banner with the text \"Happy Anniversary\" in a bold, sans-serif font. Below this banner, there is a circular frame containing a photograph of a couple. The man has short, dark hair and is wearing a light-colored sweater, while the woman has long blonde hair and is also wearing a light-colored sweater. They are both smiling and appear to be embracing each other.Surrounding the circular frame are decorative elements such as pink flowers and green leaves, which add a festive touch to the design. Below the circular frame, there is a text that reads "Isabel & Morgan" in a cursive, elegant font, suggesting that the couple\'s names are Isabel and Morgan.At the bottom of the image, there is a banner with a message that says "Happy Anniversary! Cheers to another year of love, laughter, and cherished memories together.\" This text is in a smaller, sans-serif font and is placed against a solid background, providing a clear message of celebration and well-wishes for the couple.The overall style of the image is warm and celebratory, with a color scheme that includes shades of pink, green, and white, which contribute to a joyful and romantic atmosphere.'],
615
+ ['The image is a digital illustration with a light blue background. At the top, there is a logo consisting of a snake wrapped around a staff, which is a common symbol in healthcare. Below the logo, the text "International Nurses Day" is prominently displayed in white, with the date "12 May 20xx" in smaller font size.The central part of the image features two stylized characters. On the left, there is a female character with dark hair, wearing a white nurse\'s uniform with a cap. She is holding a clipboard and appears to be speaking or gesturing, as indicated by a speech bubble with the word "OK" in it. On the right, there is a male character with light brown hair, wearing a light blue shirt with a white collar and a white apron. He is holding a stethoscope to his ear, suggesting he is a doctor or a healthcare professional.The characters are depicted in a friendly and approachable manner, with smiles on their faces. Around them, there are small blue plus signs, which are often associated with healthcare and medical services. The overall style of the image is clean, modern, and appears to be designed to celebrate International Nurses Day.'],
616
+ ['The image features a graphic design with a festive theme. At the top, there is a decorative border with a wavy pattern. Below this border, the text "WINTER SEASON SPECIAL COOKIES" is prominently displayed in a bold, sans-serif font. The text is black with a slight shadow effect, giving it a three-dimensional appearance.In the center of the image, there are three illustrated gingerbread cookies. Each cookie has a smiling face with eyes, a nose, and a mouth, and they are colored in a warm, brown hue. The cookies are arranged in a staggered formation, with the middle cookie slightly higher than the others, creating a sense of depth.At the bottom of the image, there is a call to action that reads "ORDER.NOW" in a large, bold, sans-serif font. The text is colored in a darker shade of brown, contrasting with the lighter background. The overall style of the image suggests it is an advertisement or promotional graphic for a winter-themed cookie special.']
617
+ ],
618
+ inputs=[intention_input]
619
+ )
620
+
621
+ gr.HTML("<h2>Anonymous Region Transformer</h2>")
622
+ with gr.Row():
623
+ with gr.Column():
624
+ text_input = gr.Textbox(lines=10, placeholder="Enter prompt text", label="Prompt")
625
+ tuple_input = gr.Textbox(lines=5, placeholder="Enter list of tuples, e.g., [(1, 2, 3, 4), (5, 6, 7, 8)]", label="Validation Box")
626
+ with gr.Row():
627
+ true_gs_input = gr.Slider(minimum=3.0, maximum=5.0, step=0.1, label="true_gs", value=3.5)
628
+ inference_steps_input = gr.Slider(minimum=5, maximum=50, step=1, label="inference_steps", value=28)
629
+ with gr.Row():
630
+ seed_input = gr.Number(label="Seed", value=42)
631
+ with gr.Row():
632
+ transfer_btn2 = gr.Button("Import from above")
633
+ with gr.Row():
634
+ clear_btn2 = gr.Button("Clear")
635
+ model_btn2 = gr.Button("Commit", variant='primary')
636
+
637
+ with gr.Column():
638
+ result_images = gr.Gallery(label="Result Images", columns=5, height='auto')
639
+
640
+ gr.HTML("<h1>SVG Image</h1>")
641
+ svg_file = gr.File(label="Download SVG Image")
642
+ svg_editor = gr.HTML(label="Editable SVG Editor")
643
+
644
+ model_btn1.click(
645
+ fn=process_preddate,
646
+ inputs=[intention_input, temperature_input, top_p_input],
647
+ outputs=[list_box_output, text_input, tuple_input],
648
+ api_name="process_preddate"
649
+ )
650
+ clear_btn1.click(
651
+ fn=clear_inputs1,
652
+ inputs=[],
653
+ outputs=[intention_input, list_box_output]
654
+ )
655
+ model_btn2.click(
656
+ fn=process_svg,
657
+ inputs=[text_input, tuple_input, seed_input, true_gs_input, inference_steps_input],
658
+ outputs=[result_images, svg_file, svg_editor],
659
+ api_name="process_svg"
660
+ )
661
+ clear_btn2.click(
662
+ fn=clear_inputs2,
663
+ inputs=[],
664
+ outputs=[text_input, tuple_input]
665
+ )
666
+ transfer_btn1.click(
667
+ fn=transfer_inputs,
668
+ inputs=[intention_input, list_box_output],
669
+ outputs=[text_input, tuple_input]
670
+ )
671
+ transfer_btn2.click(
672
+ fn=transfer_inputs,
673
+ inputs=[intention_input, list_box_output],
674
+ outputs=[text_input, tuple_input]
675
+ )
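+ # One-click path: a single handler takes the intention plus both stages' sampling
+ # settings and returns the planned layout, generated layers, and SVG in one call.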
676
+ one_click_btn.click(
677
+ fn=one_click_generate,
678
+ inputs=[intention_input, temperature_input, top_p_input, seed_input, true_gs_input, inference_steps_input],
679
+ outputs=[list_box_output, result_images, svg_file, svg_editor, text_input, tuple_input]
680
+ )
681
+ demo.launch()
682
+
683
+ if __name__ == "__main__":
684
+ main()
config/base.py ADDED
@@ -0,0 +1,31 @@
1
+ ### Model Settings
2
+ pretrained_model_name_or_path = "black-forest-labs/FLUX.1-dev"
3
+ revision = None
4
+ variant = None
5
+ cache_dir = None
6
+
7
+ ### Training Settings
8
+ seed = 42
9
+ report_to = "wandb"
10
+ tracker_project_name = "multilayer"
11
+ wandb_job_name = "YOU_FORGET_TO_SET"
12
+ logging_dir = "logs"
13
+ max_train_steps = None
14
+ checkpoints_total_limit = None
15
+
16
+ # gpu
17
+ allow_tf32 = True
18
+ gradient_checkpointing = True
19
+ mixed_precision = "bf16"
20
+
21
+ ### Validation Settings
22
+ num_validation_images = 1
23
+ validation_steps = 5
24
+ validation_prompts = [
25
+ "The image features a simple, flat design with a solid pink background. On the left side, there is a stylized depiction of a decorated egg with a pattern of alternating white and light blue stripes. The egg has a smooth, oval shape and is outlined with a thin line. In the center of the image, there is a floral arrangement consisting of a large, white flower with a green center and several smaller white flowers with green centers. The flowers are connected by thin green stems and leaves, creating a small bouquet. On the right side of the image, there is another egg similar to the one on the left, with the same pattern of stripes. This egg is also outlined with a thin line and has a smooth, oval shape. The overall style of the image is clean and modern, with a limited color palette and a focus on geometric shapes and simple patterns. There are no texts or additional elements in the image.",
26
+ "The image features a cartoon-style illustration with three characters against a blue background. On the left side, there is a green, goblin-like creature with large, expressive eyes and a wide grin. It has a small body and is standing upright with its arms raised in a welcoming or excited gesture. In the center, there is a large, white, egg-shaped object that appears to be floating or resting on the surface. It has a smooth, rounded shape and is the largest object in the image. On the right side, there is a purple dinosaur with a friendly expression. It has a small head, large eyes, and a wide mouth that seems to be smiling. The dinosaur is standing on all fours and appears to be looking towards the viewer. The overall style of the image is playful and whimsical, with a clear emphasis on the characters rather than any specific background details.",
27
+ "The image features a collection of Christmas-themed objects against a solid green background. On the left side, there is a red Christmas ornament with a white pattern, resembling a traditional Christmas ball. Next to it, there is a red and white striped stocking with a small white cuff at the top. On the right side, there is a cartoon-style depiction of Santa Claus' face, with a white beard, red cheeks, and a smiling expression. The Santa face is stylized with simple lines and shapes, giving it a friendly and festive appearance. The overall style of the image is flat and graphic, with a clear focus on holiday-related items.",
28
+ "The image depicts a stylized illustration of a rocket launch. The rocket, which is the central focus of the image, is depicted in a simplified, cartoon-like style with a white body and a pointed nose cone. It is shown ascending into a dark background, which is likely meant to represent the night sky. Above the rocket, there are several small, golden stars scattered across the sky, adding a sense of motion and direction to the rocket's ascent. The stars are of varying sizes and are positioned at different heights, creating a sense of depth and distance. The overall style of the image is minimalist and modern, with a limited color palette that emphasizes the rocket and the stars against the dark background. The image does not contain any text or additional elements that would provide context or narrative beyond the depiction of the rocket launch.",
29
+ "The image features a stylized, cartoon-like depiction of a bear. The bear is predominantly pink with a lighter pink nose and a small black dot for an eye. It has two small ears and a small black line for a mouth. The bear is standing upright and appears to be holding a yellow object, possibly a piece of paper or a card, in its right paw. To the right of the bear, there is a purple background with a large, heart-shaped doodle. The overall style of the image is simplistic and child-friendly, with a limited color palette and a clear, uncluttered composition.",
30
+ "The image features three ice cream cones against a pink background. Each cone is filled with a different flavor of ice cream: the leftmost cone has chocolate ice cream, the middle cone has vanilla ice cream, and the rightmost cone has strawberry ice cream. The ice cream is topped with a drizzle of the respective flavor's syrup, and each cone is adorned with a small, round, chocolate-covered piece of candy. The image also contains text that reads 'Sprinkle Sunday Ice Cream Factory East Avenue, CA 13154' and a phone number '+799-2324-9890'. Additionally, there is a website address 'www.sprinklesunday.com'. The style of the image is illustrative and appears to be designed for advertising or promotional purposes.",
31
+ ]
config/v04sv03_lora_r64_upto50layers_bs1_lr1_prodigy_800k_wds_512_filtered_10ep_none_8gpu.py ADDED
@@ -0,0 +1,111 @@
1
+ _base_ = "./base.py"
2
+
3
+ ### path & device settings
4
+ img_tar_path = "/openseg_blob/puyifan/shared_data/CANVA_802000_resolution512max21760tokens/"
5
+ output_path_base = "/openseg_blob/zhaoym/multi_layer_sd3/work_dirs/"
6
+ cache_dir = "/openseg_blob/zhaoym/pretrained/flux"
7
+ # transformer_varient = "ashen0209/Flux-Dev2Pro"
8
+ pretrained_lora_dir = "/openseg_blob/zhaoym/sd3/work_dirs/canva500k_mix100k_sft_flux"
9
+ total_gpu_num = 8
10
+
11
+ ### wandb settings
12
+ wandb_job_name = "flux_" + '{{fileBasenameNoExtension}}'
13
+
14
+ ### Dataset Settings
15
+ resolution = 512
16
+ dataloader_pin_memory = True
17
+ dataloader_num_workers = 16
18
+ train_batch_size = 1
19
+ dataset_cfg = dict(
20
+ img_tar_path=img_tar_path,
21
+ num_train_examples=802000,
22
+ per_gpu_batch_size=train_batch_size,
23
+ global_batch_size=(train_batch_size * total_gpu_num),
24
+ num_workers=dataloader_num_workers,
25
+ resolution=resolution,
26
+ center_crop=True,
27
+ random_flip=False,
28
+ shuffle_buffer_size=1000,
29
+ pin_memory=dataloader_pin_memory,
30
+ persistent_workers=True,
31
+ )
32
+
33
+ ### Model Settings
34
+ rank = 64
35
+ text_encoder_rank = 64
36
+ train_text_encoder = False
37
+ max_layer_num = 50 + 2
38
+ learnable_proj = True
39
+
40
+ ### Training Settings
41
+ weighting_scheme = "none"
42
+ logit_mean = 0.0
43
+ logit_std = 1.0
44
+ mode_scale = 1.29
45
+ guidance_scale = 1.0 ###IMPORTANT
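+ # (assumption) kept at 1.0 because FLUX.1-dev consumes guidance as an embedding; no classifier-free guidance is applied during training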
46
+ layer_weighting = 5.0
47
+
48
+ # steps
49
+ # train_batch_size = 1
50
+ num_train_epochs = 1
51
+ max_train_steps = None
52
+ checkpointing_steps = 2000
53
+ resume_from_checkpoint = "latest"
54
+ gradient_accumulation_steps = 1
55
+
56
+ # lr
57
+ optimizer = "prodigy"
58
+ learning_rate = 1.0
59
+ scale_lr = False
60
+ lr_scheduler = "constant"
61
+ lr_warmup_steps = 0
62
+ lr_num_cycles = 1
63
+ lr_power = 1.0
64
+
65
+ # optim
66
+ adam_beta1 = 0.9
67
+ adam_beta2 = 0.999
68
+ adam_weight_decay = 1e-3
69
+ adam_epsilon = 1e-8
70
+ prodigy_beta3 = None
71
+ prodigy_decouple = True
72
+ prodigy_use_bias_correction = True
73
+ prodigy_safeguard_warmup = True
74
+ max_grad_norm = 1.0
75
+
76
+ # logging
77
+ tracker_task_name = '{{fileBasenameNoExtension}}'
78
+ output_dir = output_path_base + "{{fileBasenameNoExtension}}"
79
+
80
+ ### Validation Settings
81
+ num_validation_images = 1
82
+ validation_steps = 2000
83
+ validation_prompts = [
84
+ 'The image features a background with a soft, pastel color gradient that transitions from pink to purple. There are abstract floral elements scattered throughout the background, with some appearing to be in full bloom and others in a more delicate, bud-like state. The flowers have a watercolor effect, with soft edges that blend into the background.\n\nCentered in the image is a quote in a serif font that reads, "You\'re free to be different." The text is black, which stands out against the lighter background. The overall style of the image is artistic and inspirational, with a motivational message that encourages individuality and self-expression. The image could be used for motivational purposes, as a background for a blog or social media post, or as part of a personal development or self-help theme.',
85
+ 'The image features a logo for a company named "Bull Head Party Adventure." The logo is stylized with a cartoon-like depiction of a bull\'s head, which is the central element of the design. The bull has prominent horns and a fierce expression, with its mouth slightly open as if it\'s snarling or roaring. The color scheme of the bull is a mix of brown and beige tones, with the horns highlighted in a lighter shade.\n\nBelow the bull\'s head, the company name is written in a bold, sans-serif font. The text is arranged in two lines, with "Bull Head" on the top line and "Party Adventure" on the bottom line. The font color matches the color of the bull, creating a cohesive look. The overall style of the image is playful and energetic, suggesting that the company may offer exciting or adventurous party experiences.',
86
+ 'The image features a festive and colorful illustration with a theme related to the Islamic holiday of Eid al-Fitr. At the center of the image is a large, ornate crescent moon with intricate patterns and decorations. Surrounding the moon are several smaller stars and crescents, also adorned with decorative elements. These smaller celestial motifs are suspended from the moon, creating a sense of depth and dimension.\n\nBelow the central moon, there is a banner with the text "Eid Mubarak" in a stylized, elegant font. The text is in a bold, dark color that stands out against the lighter background. The background itself is a gradient of light to dark green, which complements the golden and white hues of the celestial motifs.\n\nThe overall style of the image is celebratory and decorative, with a focus on the traditional symbols associated with Eid al-Fitr. The use of gold and white gives the image a luxurious and festive feel, while the green background is a color often associated with Islam. The image appears to be a digital artwork or graphic design, possibly intended for use as a greeting card or a festive decoration.',
87
+ 'The image is a festive graphic with a dark background. At the center, there is a large, bold text that reads "Happy New Year 2023" in a combination of white and gold colors. The text is surrounded by numerous white balloons with gold ribbons, giving the impression of a celebratory atmosphere. The balloons are scattered around the text, creating a sense of depth and movement. Additionally, there are small gold sparkles and confetti-like elements that add to the celebratory theme. The overall design suggests a New Year\'s celebration, with the year 2023 being the focal point.',
88
+ 'The image is a stylized illustration with a flat design aesthetic. It depicts a scene related to healthcare or medical care. In the center, there is a hospital bed with a patient lying down, appearing to be resting or possibly receiving treatment. The patient is surrounded by three individuals who seem to be healthcare professionals or caregivers. They are standing around the bed, with one on each side and one at the foot of the bed. The person at the foot of the bed is holding a clipboard, suggesting they might be taking notes or reviewing medical records.\n\nThe room has a window with curtains partially drawn, allowing some light to enter. The color palette is soft, with pastel tones dominating the scene. The text "INTERNATIONAL CANCER DAY" is prominently displayed at the top of the image, indicating that the illustration is related to this event. The overall impression is one of care and support, with a focus on the patient\'s well-being.',
89
+ 'The image features a stylized illustration of a man with a beard and a tank top, drinking from a can. The man is depicted in a simplified, cartoon-like style with a limited color palette. Above him, there is a text that reads "Happy Eating, Friends" in a bold, friendly font. Below the illustration, there is another line of text that states "Food is a Necessity That is Not Prioritized," which is also in a bold, sans-serif font. The background of the image is a gradient of light to dark blue, giving the impression of a sky or a calm, serene environment. The overall style of the image is casual and approachable, with a focus on the message conveyed by the text.',
90
+ 'The image is a digital illustration with a pastel pink background. At the top, there is a text that reads "Sending you my Easter wishes" in a simple, sans-serif font. Below this, a larger text states "May Your Heart be Happy!" in a more decorative, serif font. Underneath this main message, there is a smaller text that says "Let the miracle of the season fill you with hope and love."\n\nThe illustration features three stylized flowers with smiling faces. On the left, there is a purple flower with a yellow center. In the center, there is a blue flower with a green center. On the right, there is a pink flower with a yellow center. Each flower has a pair of eyes and a mouth, giving them a friendly appearance. The flowers are drawn with a cartoon-like style, using solid colors and simple shapes.\n\nThe overall style of the image is cheerful and whimsical, with a clear Easter theme suggested by the text and the presence of flowers, which are often associated with spring and new beginnings.',
91
+ 'The image is a vibrant and colorful graphic with a pink background. In the center, there is a photograph of a man and a woman embracing each other. The man is wearing a white shirt, and the woman is wearing a patterned top. They are both smiling and appear to be in a joyful mood.\n\nSurrounding the photograph are various elements that suggest a festive or celebratory theme. There are three hot air balloons in the background, each with a different design: one with a heart, one with a gift box, and one with a basket. These balloons are floating against a clear sky.\n\nAdditionally, there are two gift boxes with ribbons, one on the left and one on the right side of the image. These gift boxes are stylized with a glossy finish and are placed at different heights, creating a sense of depth.\n\nAt the bottom of the image, there is a large red heart, which is a common symbol associated with love and Valentine\'s Day.\n\nFinally, at the very bottom of the image, there is a text that reads "Happy Valentine\'s Day," which confirms the theme of the image as a Valentine\'s Day greeting. The text is in a playful, cursive font that matches the overall cheerful and romantic tone of the image.',
92
+ 'The image depicts a stylized illustration of two women sitting on stools, engaged in conversation. They are wearing traditional attire, with headscarves and patterned dresses. The woman on the left is wearing a brown dress with a purple pattern, while the woman on the right is wearing a purple dress with a brown pattern. Between them is a purple flower. Above the women, the text "INTERNATIONAL WOMEN\'S DAY" is written in bold, uppercase letters. The background is a soft, pastel pink, and there are abstract, swirling lines in a darker shade of pink above the women. The overall style of the image is simplistic and cartoonish, with a warm and friendly tone.',
93
+ 'The image is a digital graphic with a clean, minimalist design. It features a light blue background with a subtle floral pattern at the bottom. On the left side, there is a large, bold text that reads "Our Global Idea." The text is in a serif font and is colored in a darker shade of blue, creating a contrast against the lighter background.\n\nOn the right side, there is a smaller text in a sans-serif font that provides information about utilizing the Live Q&A feature of Canva. The text suggests using this feature to engage an audience more effectively, such as asking about their opinions on certain topics and themes. The text is in a lighter shade of blue, which matches the background, and it is enclosed within a decorative border that includes a floral motif, mirroring the design at the bottom of the image.\n\nThe overall style of the image is professional and modern, with a focus on typography and a simple color scheme. The design elements are well-balanced, with the text and decorative elements complementing each other without overwhelming the viewer.',
94
+ 'The image is a stylized illustration with a warm, peach-colored background. At the center, there is a vintage-style radio with a prominent dial and antenna. The radio is emitting a blue, star-like burst of light or energy from its top. Surrounding the radio are various objects and elements that seem to be floating or suspended in the air. These include a brown, cone-shaped object, a blue, star-like shape, and a brown, wavy, abstract shape that could be interpreted as a flower or a wave.\n\nAt the top of the image, there is text that reads "World Radio Day" in a bold, serif font. Below this, in a smaller, sans-serif font, is the date "13 February 2022." The overall style of the image is playful and cartoonish, with a clear focus on celebrating World Radio Day.',
95
+ 'The image is a graphic design of a baby shower invitation. The central focus is a cute, cartoon-style teddy bear with a friendly expression, sitting upright. The bear is colored in a soft, light brown hue. Above the bear, there is a bold text that reads "YOU\'RE INVITED" in a playful, sans-serif font. Below this, the words "BABY SHOWER" are prominently displayed in a larger, more decorative font, suggesting the theme of the event.\n\nThe background of the invitation is a soft, light pink color, which adds to the gentle and welcoming atmosphere of the design. At the bottom of the image, there is additional text providing specific details about the event. It reads "27 January, 2022 - 8:00 PM" followed by "FAUGET INDUSTRIES CAFE," indicating the date, time, and location of the baby shower.\n\nThe overall style of the image is warm, inviting, and child-friendly, with a clear focus on the theme of a baby shower celebration. The use of a teddy bear as the central image reinforces the baby-related theme. The design is simple yet effective, with a clear hierarchy of information that guides the viewer\'s attention from the top to the bottom of the invitation.',
96
+ ]
97
+
98
+ validation_boxes = [
99
+ [(0, 0, 512, 512), (0, 0, 512, 512), (368, 0, 512, 272), (0, 272, 112, 512), (160, 208, 352, 304)],
100
+ [(0, 0, 512, 512), (0, 0, 512, 512), (128, 128, 384, 304), (96, 288, 416, 336), (128, 336, 384, 368)],
101
+ [(0, 0, 512, 512), (0, 0, 512, 512), (112, 48, 400, 368), (0, 48, 96, 176), (128, 336, 384, 384), (240, 384, 384, 432)],
102
+ [(0, 0, 512, 512), (0, 0, 512, 512), (32, 32, 480, 480), (80, 176, 432, 368), (64, 176, 448, 224), (144, 96, 368, 224)],
103
+ [(0, 0, 512, 512), (0, 0, 512, 512), (0, 64, 176, 272), (0, 400, 512, 512), (16, 160, 496, 512), (224, 48, 464, 112), (208, 96, 464, 160)],
104
+ [(0, 0, 512, 512), (0, 0, 512, 512), (112, 224, 512, 512), (0, 0, 240, 160), (144, 144, 512, 512), (48, 64, 432, 208), (48, 400, 256, 448)],
105
+ [(0, 0, 512, 512), (0, 0, 512, 512), (160, 48, 352, 80), (64, 80, 448, 192), (128, 208, 384, 240), (320, 240, 512, 512), (80, 272, 368, 512), (0, 224, 192, 512)],
106
+ [(0, 0, 512, 512), (0, 0, 512, 512), (48, 0, 464, 304), (128, 144, 384, 400), (288, 288, 384, 368), (336, 304, 400, 368), (176, 432, 336, 480), (224, 400, 288, 432)],
107
+ [(0, 0, 512, 512), (0, 0, 512, 512), (32, 288, 448, 512), (144, 176, 336, 400), (224, 208, 272, 256), (160, 128, 336, 192), (192, 368, 304, 400), (368, 80, 448, 224), (48, 160, 128, 256)],
108
+ [(0, 0, 512, 512), (0, 0, 512, 512), (0, 112, 112, 240), (400, 272, 512, 416), (400, 112, 512, 240), (0, 272, 112, 400), (64, 192, 176, 320), (224, 192, 432, 320), (224, 304, 448, 368)],
109
+ [(0, 0, 512, 512), (0, 0, 512, 512), (0, 352, 512, 512), (112, 176, 368, 432), (48, 176, 128, 256), (48, 368, 128, 448), (384, 192, 480, 272), (384, 336, 432, 384), (80, 80, 432, 128), (176, 128, 336, 160)],
110
+ [(0, 0, 512, 512), (0, 0, 512, 512), (0, 0, 512, 352), (144, 384, 368, 448), (160, 192, 352, 432), (368, 0, 512, 144), (0, 0, 144, 144), (128, 80, 384, 208), (128, 448, 384, 496), (176, 48, 336, 80)],
111
+ ]
custom_model_mmdit.py ADDED
@@ -0,0 +1,334 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Any, Dict, List, Optional, Union, Tuple
4
+
5
+ from accelerate.utils import set_module_tensor_to_device
6
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
7
+ from diffusers.models.normalization import AdaLayerNormContinuous
8
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
9
+ from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel, FluxTransformerBlock, FluxSingleTransformerBlock
10
+
11
+ from diffusers.configuration_utils import register_to_config
12
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
13
+
14
+
15
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
16
+
17
+
18
+ class CustomFluxTransformer2DModel(FluxTransformer2DModel):
19
+ """
20
+ The Transformer model introduced in Flux.
21
+
22
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
23
+
24
+ Parameters:
25
+ patch_size (`int`): Patch size to turn the input data into small patches.
26
+ in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
27
+ num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
28
+ num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
29
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
30
+ num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
31
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
32
+ pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
33
+ guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
34
+ """
35
+
36
+ @register_to_config
37
+ def __init__(
38
+ self,
39
+ patch_size: int = 1,
40
+ in_channels: int = 64,
41
+ num_layers: int = 19,
42
+ num_single_layers: int = 38,
43
+ attention_head_dim: int = 128,
44
+ num_attention_heads: int = 24,
45
+ joint_attention_dim: int = 4096,
46
+ pooled_projection_dim: int = 768,
47
+ guidance_embeds: bool = False,
48
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
49
+ max_layer_num: int = 10,
50
+ ):
51
+ super(FluxTransformer2DModel, self).__init__()
52
+ self.out_channels = in_channels
53
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
54
+
55
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
56
+
57
+ text_time_guidance_cls = (
58
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
59
+ )
60
+ self.time_text_embed = text_time_guidance_cls(
61
+ embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
62
+ )
63
+
64
+ self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
65
+ self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)
66
+
67
+ self.transformer_blocks = nn.ModuleList(
68
+ [
69
+ FluxTransformerBlock(
70
+ dim=self.inner_dim,
71
+ num_attention_heads=self.config.num_attention_heads,
72
+ attention_head_dim=self.config.attention_head_dim,
73
+ )
74
+ for i in range(self.config.num_layers)
75
+ ]
76
+ )
77
+
78
+ self.single_transformer_blocks = nn.ModuleList(
79
+ [
80
+ FluxSingleTransformerBlock(
81
+ dim=self.inner_dim,
82
+ num_attention_heads=self.config.num_attention_heads,
83
+ attention_head_dim=self.config.attention_head_dim,
84
+ )
85
+ for i in range(self.config.num_single_layers)
86
+ ]
87
+ )
88
+
89
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
90
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
91
+
92
+ self.gradient_checkpointing = False
93
+
94
+ self.max_layer_num = max_layer_num
95
+
96
+ # the following process ensures self.layer_pe is not created as a meta tensor
97
+ self.layer_pe = nn.Parameter(torch.empty(1, self.max_layer_num, 1, 1, self.inner_dim))
98
+ nn.init.trunc_normal_(self.layer_pe, mean=0.0, std=0.02, a=-2.0, b=2.0)
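+ # layer_pe is a learnable per-layer positional embedding (one slot per layer, up to
+ # max_layer_num); forward() broadcasts it over each layer's spatial grid so tokens from
+ # different layers stay distinguishable after they are concatenated into one sequence.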
99
+ # layer_pe_value = nn.init.trunc_normal_(
100
+ # nn.Parameter(torch.zeros(
101
+ # 1, self.max_layer_num, 1, 1, self.inner_dim,
102
+ # )),
103
+ # mean=0.0, std=0.02, a=-2.0, b=2.0,
104
+ # ).data.detach()
105
+ # self.layer_pe = nn.Parameter(layer_pe_value)
106
+ # set_module_tensor_to_device(
107
+ # self,
108
+ # 'layer_pe',
109
+ # device='cpu',
110
+ # value=layer_pe_value,
111
+ # dtype=layer_pe_value.dtype,
112
+ # )
113
+
114
+ @classmethod
115
+ def from_pretrained(cls, *args, **kwarg):
116
+ model = super().from_pretrained(*args, **kwarg)
117
+ for name, para in model.named_parameters():
118
+ if name != 'layer_pe':
119
+ device = para.device
120
+ break
121
+ model.layer_pe.data = model.layer_pe.data.to(device)  # Tensor.to() is not in-place; assign the moved copy back
122
+ return model
123
+
124
+ def crop_each_layer(self, hidden_states, list_layer_box):
125
+ """
126
+ hidden_states: [1, n_layers, h, w, inner_dim]
127
+ list_layer_box: List, length=n_layers, each element is a Tuple of 4 elements (x1, y1, x2, y2)
128
+ """
129
+ token_list = []
130
+ for layer_idx in range(hidden_states.shape[1]):
131
+ if list_layer_box[layer_idx] is None:
132
+ continue
133
+ else:
134
+ x1, y1, x2, y2 = list_layer_box[layer_idx]
135
+ x1, y1, x2, y2 = x1 // 16, y1 // 16, x2 // 16, y2 // 16
136
+ layer_token = hidden_states[:, layer_idx, y1:y2, x1:x2, :]
137
+ bs, h, w, c = layer_token.shape
138
+ layer_token = layer_token.reshape(bs, -1, c)
139
+ token_list.append(layer_token)
140
+ result = torch.cat(token_list, dim=1)
141
+ return result
142
+
143
+ def fill_in_processed_tokens(self, hidden_states, full_hidden_states, list_layer_box):
144
+ """
145
+ hidden_states: [1, h1xw1 + h2xw2 + ... + hlxwl , inner_dim]
146
+ full_hidden_states: [1, n_layers, h, w, inner_dim]
147
+ list_layer_box: List, length=n_layers, each element is a Tuple of 4 elements (x1, y1, x2, y2)
148
+ """
149
+ used_token_len = 0
150
+ bs = hidden_states.shape[0]
151
+ for layer_idx in range(full_hidden_states.shape[1]):
152
+ if list_layer_box[layer_idx] is None:
153
+ continue
154
+ else:
155
+ x1, y1, x2, y2 = list_layer_box[layer_idx]
156
+ x1, y1, x2, y2 = x1 // 16, y1 // 16, x2 // 16, y2 // 16
157
+ full_hidden_states[:, layer_idx, y1:y2, x1:x2, :] = hidden_states[:, used_token_len: used_token_len + (y2-y1) * (x2-x1), :].reshape(bs, y2-y1, x2-x1, -1)
158
+ used_token_len = used_token_len + (y2-y1) * (x2-x1)
159
+ return full_hidden_states
160
+
161
+ def forward(
162
+ self,
163
+ hidden_states: torch.Tensor,
164
+ list_layer_box: List[Tuple] = None,
165
+ encoder_hidden_states: torch.Tensor = None,
166
+ pooled_projections: torch.Tensor = None,
167
+ timestep: torch.LongTensor = None,
168
+ img_ids: torch.Tensor = None,
169
+ txt_ids: torch.Tensor = None,
170
+ guidance: torch.Tensor = None,
171
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
172
+ return_dict: bool = True,
173
+ ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
174
+ """
175
+ The [`FluxTransformer2DModel`] forward method.
176
+
177
+ Args:
178
+ hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
179
+ Input `hidden_states`.
180
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
181
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
182
+ pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
183
+ from the embeddings of input conditions.
184
+ timestep ( `torch.LongTensor`):
185
+ Used to indicate denoising step.
186
+ block_controlnet_hidden_states: (`list` of `torch.Tensor`):
187
+ A list of tensors that if specified are added to the residuals of transformer blocks.
188
+ joint_attention_kwargs (`dict`, *optional*):
189
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
190
+ `self.processor` in
191
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
192
+ return_dict (`bool`, *optional*, defaults to `True`):
193
+ Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
194
+ tuple.
195
+
196
+ Returns:
197
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
198
+ `tuple` where the first element is the sample tensor.
199
+ """
200
+ if joint_attention_kwargs is not None:
201
+ joint_attention_kwargs = joint_attention_kwargs.copy()
202
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
203
+ else:
204
+ lora_scale = 1.0
205
+
206
+ if USE_PEFT_BACKEND:
207
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
208
+ scale_lora_layers(self, lora_scale)
209
+ else:
210
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
211
+ logger.warning(
212
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
213
+ )
214
+
215
+ bs, n_layers, channel_latent, height, width = hidden_states.shape # [bs, n_layers, c_latent, h, w]
216
+
217
+ hidden_states = hidden_states.view(bs, n_layers, channel_latent, height // 2, 2, width // 2, 2) # [bs, n_layers, c_latent, h/2, 2, w/2, 2]
218
+ hidden_states = hidden_states.permute(0, 1, 3, 5, 2, 4, 6) # [bs, n_layers, h/2, w/2, c_latent, 2, 2]
219
+ hidden_states = hidden_states.reshape(bs, n_layers, height // 2, width // 2, channel_latent * 4) # [bs, n_layers, h/2, w/2, c_latent*4]
220
+ hidden_states = self.x_embedder(hidden_states) # [bs, n_layers, h/2, w/2, inner_dim]
221
+
222
+ full_hidden_states = torch.zeros_like(hidden_states) # [bs, n_layers, h/2, w/2, inner_dim]
223
+ layer_pe = self.layer_pe.view(1, self.max_layer_num, 1, 1, self.inner_dim) # [1, max_n_layers, 1, 1, inner_dim]
224
+ hidden_states = hidden_states + layer_pe[:, :n_layers] # [bs, n_layers, h/2, w/2, inner_dim] + [1, n_layers, 1, 1, inner_dim] --> [bs, f, h/2, w/2, inner_dim]
225
+ hidden_states = self.crop_each_layer(hidden_states, list_layer_box) # [bs, token_len, inner_dim]
226
+
227
+ timestep = timestep.to(hidden_states.dtype) * 1000
228
+ if guidance is not None:
229
+ guidance = guidance.to(hidden_states.dtype) * 1000
230
+ else:
231
+ guidance = None
232
+ temb = (
233
+ self.time_text_embed(timestep, pooled_projections)
234
+ if guidance is None
235
+ else self.time_text_embed(timestep, guidance, pooled_projections)
236
+ )
237
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
238
+
239
+ if txt_ids.ndim == 3:
240
+ logger.warning(
241
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
242
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
243
+ )
244
+ txt_ids = txt_ids[0]
245
+ if img_ids.ndim == 3:
246
+ logger.warning(
247
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
248
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
249
+ )
250
+ img_ids = img_ids[0]
251
+ ids = torch.cat((txt_ids, img_ids), dim=0)
252
+ image_rotary_emb = self.pos_embed(ids)
253
+
254
+ for index_block, block in enumerate(self.transformer_blocks):
255
+ if self.training and self.gradient_checkpointing:
256
+
257
+ def create_custom_forward(module, return_dict=None):
258
+ def custom_forward(*inputs):
259
+ if return_dict is not None:
260
+ return module(*inputs, return_dict=return_dict)
261
+ else:
262
+ return module(*inputs)
263
+
264
+ return custom_forward
265
+
266
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
267
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
268
+ create_custom_forward(block),
269
+ hidden_states,
270
+ encoder_hidden_states,
271
+ temb,
272
+ image_rotary_emb,
273
+ **ckpt_kwargs,
274
+ )
275
+
276
+ else:
277
+ encoder_hidden_states, hidden_states = block(
278
+ hidden_states=hidden_states,
279
+ encoder_hidden_states=encoder_hidden_states,
280
+ temb=temb,
281
+ image_rotary_emb=image_rotary_emb,
282
+ )
283
+
284
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
285
+
286
+ for index_block, block in enumerate(self.single_transformer_blocks):
287
+ if self.training and self.gradient_checkpointing:
288
+
289
+ def create_custom_forward(module, return_dict=None):
290
+ def custom_forward(*inputs):
291
+ if return_dict is not None:
292
+ return module(*inputs, return_dict=return_dict)
293
+ else:
294
+ return module(*inputs)
295
+
296
+ return custom_forward
297
+
298
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
299
+ hidden_states = torch.utils.checkpoint.checkpoint(
300
+ create_custom_forward(block),
301
+ hidden_states,
302
+ temb,
303
+ image_rotary_emb,
304
+ **ckpt_kwargs,
305
+ )
306
+
307
+ else:
308
+ hidden_states = block(
309
+ hidden_states=hidden_states,
310
+ temb=temb,
311
+ image_rotary_emb=image_rotary_emb,
312
+ )
313
+
314
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
315
+
316
+ hidden_states = self.fill_in_processed_tokens(hidden_states, full_hidden_states, list_layer_box) # [bs, n_layers, h/2, w/2, inner_dim]
317
+ hidden_states = hidden_states.view(bs, -1, self.inner_dim) # [bs, n_layers * full_len, inner_dim]
318
+
319
+ hidden_states = self.norm_out(hidden_states, temb) # [bs, n_layers * full_len, inner_dim]
320
+ hidden_states = self.proj_out(hidden_states) # [bs, n_layers * full_len, c_latent*4]
321
+
322
+ # unpatchify
323
+ hidden_states = hidden_states.view(bs, n_layers, height//2, width//2, channel_latent, 2, 2) # [bs, n_layers, h/2, w/2, c_latent, 2, 2]
324
+ hidden_states = hidden_states.permute(0, 1, 4, 2, 5, 3, 6)
325
+ output = hidden_states.reshape(bs, n_layers, channel_latent, height, width) # [bs, n_layers, c_latent, h, w]
326
+
327
+ if USE_PEFT_BACKEND:
328
+ # remove `lora_scale` from each PEFT layer
329
+ unscale_lora_layers(self, lora_scale)
330
+
331
+ if not return_dict:
332
+ return (output,)
333
+
334
+ return Transformer2DModelOutput(sample=output)
custom_model_transp_vae.py ADDED
@@ -0,0 +1,331 @@
1
+ import einops
2
+ from collections import OrderedDict
3
+ from functools import partial
4
+ from typing import Callable
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torchvision
9
+ from torch.utils.checkpoint import checkpoint
10
+
11
+ from accelerate.utils import set_module_tensor_to_device
12
+ from diffusers.models.embeddings import apply_rotary_emb, FluxPosEmbed
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+ from diffusers.configuration_utils import ConfigMixin
15
+ from diffusers.loaders import FromOriginalModelMixin
16
+
17
+
18
+ class MLPBlock(torchvision.ops.misc.MLP):
19
+ """Transformer MLP block."""
20
+
21
+ _version = 2
22
+
23
+ def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
24
+ super().__init__(in_dim, [mlp_dim, in_dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
25
+
26
+ for m in self.modules():
27
+ if isinstance(m, nn.Linear):
28
+ nn.init.xavier_uniform_(m.weight)
29
+ if m.bias is not None:
30
+ nn.init.normal_(m.bias, std=1e-6)
31
+
32
+ def _load_from_state_dict(
33
+ self,
34
+ state_dict,
35
+ prefix,
36
+ local_metadata,
37
+ strict,
38
+ missing_keys,
39
+ unexpected_keys,
40
+ error_msgs,
41
+ ):
42
+ version = local_metadata.get("version", None)
43
+
44
+ if version is None or version < 2:
45
+ # Replacing legacy MLPBlock with MLP. See https://github.com/pytorch/vision/pull/6053
46
+ for i in range(2):
47
+ for type in ["weight", "bias"]:
48
+ old_key = f"{prefix}linear_{i+1}.{type}"
49
+ new_key = f"{prefix}{3*i}.{type}"
50
+ if old_key in state_dict:
51
+ state_dict[new_key] = state_dict.pop(old_key)
52
+
53
+ super()._load_from_state_dict(
54
+ state_dict,
55
+ prefix,
56
+ local_metadata,
57
+ strict,
58
+ missing_keys,
59
+ unexpected_keys,
60
+ error_msgs,
61
+ )
62
+
63
+
64
+ class EncoderBlock(nn.Module):
65
+ """Transformer encoder block."""
66
+
67
+ def __init__(
68
+ self,
69
+ num_heads: int,
70
+ hidden_dim: int,
71
+ mlp_dim: int,
72
+ dropout: float,
73
+ attention_dropout: float,
74
+ norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
75
+ ):
76
+ super().__init__()
77
+ self.num_heads = num_heads
78
+ self.hidden_dim = hidden_dim
79
+ self.num_heads = num_heads
80
+
81
+ # Attention block
82
+ self.ln_1 = norm_layer(hidden_dim)
83
+ self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True)
84
+ self.dropout = nn.Dropout(dropout)
85
+
86
+ # MLP block
87
+ self.ln_2 = norm_layer(hidden_dim)
88
+ self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout)
89
+
90
+ def forward(self, input: torch.Tensor, freqs_cis):
91
+ torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
92
+ B, L, C = input.shape
93
+ x = self.ln_1(input)
94
+ if freqs_cis is not None:
95
+ query = x.view(B, L, self.num_heads, self.hidden_dim // self.num_heads).transpose(1, 2)
96
+ query = apply_rotary_emb(query, freqs_cis)
97
+ query = query.transpose(1, 2).reshape(B, L, self.hidden_dim)
98
+ x, _ = self.self_attention(query, query, x, need_weights=False)
99
+ x = self.dropout(x)
100
+ x = x + input
101
+
102
+ y = self.ln_2(x)
103
+ y = self.mlp(y)
104
+ return x + y
105
+
106
+
107
+ class Encoder(nn.Module):
108
+ """Transformer Model Encoder for sequence to sequence translation."""
109
+
110
+ def __init__(
111
+ self,
112
+ seq_length: int,
113
+ num_layers: int,
114
+ num_heads: int,
115
+ hidden_dim: int,
116
+ mlp_dim: int,
117
+ dropout: float,
118
+ attention_dropout: float,
119
+ norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
120
+ ):
121
+ super().__init__()
122
+ # Note that batch_size is on the first dim because
123
+ # we have batch_first=True in nn.MultiAttention() by default
124
+ # self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT
125
+ self.dropout = nn.Dropout(dropout)
126
+ layers: OrderedDict[str, nn.Module] = OrderedDict()
127
+ for i in range(num_layers):
128
+ layers[f"encoder_layer_{i}"] = EncoderBlock(
129
+ num_heads,
130
+ hidden_dim,
131
+ mlp_dim,
132
+ dropout,
133
+ attention_dropout,
134
+ norm_layer,
135
+ )
136
+ self.layers = nn.Sequential(layers)
137
+ self.ln = norm_layer(hidden_dim)
138
+
139
+ def forward(self, input: torch.Tensor, freqs_cis):
140
+ torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
141
+ input = input # + self.pos_embedding
142
+ x = self.dropout(input)
143
+ for l in self.layers:
144
+ x = checkpoint(l, x, freqs_cis)
145
+ x = self.ln(x)
146
+ return x
147
+
148
+
149
+ class ViTEncoder(nn.Module):
150
+ def __init__(self, arch='vit-b/32'):
151
+ super().__init__()
152
+ self.arch = arch
153
+
154
+ if self.arch == 'vit-b/32':
155
+ ch = 768
156
+ layers = 12
157
+ heads = 12
158
+ elif self.arch == 'vit-h/14':
159
+ ch = 1280
160
+ layers = 32
161
+ heads = 16
162
+
163
+ self.encoder = Encoder(
164
+ seq_length=-1,
165
+ num_layers=layers,
166
+ num_heads=heads,
167
+ hidden_dim=ch,
168
+ mlp_dim=ch*4,
169
+ dropout=0.0,
170
+ attention_dropout=0.0,
171
+ )
172
+ self.fc_in = nn.Linear(16, ch)
173
+ self.fc_out = nn.Linear(ch, 256)
174
+
175
+ if self.arch == 'vit-b/32':
176
+ from torchvision.models.vision_transformer import vit_b_32, ViT_B_32_Weights
177
+ vit = vit_b_32(weights=ViT_B_32_Weights.DEFAULT)
178
+ elif self.arch == 'vit-h/14':
179
+ from torchvision.models.vision_transformer import vit_h_14, ViT_H_14_Weights
180
+ vit = vit_h_14(weights=ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1)
181
+
182
+ missing_keys, unexpected_keys = self.encoder.load_state_dict(vit.encoder.state_dict(), strict=False)
183
+ if len(missing_keys) > 0 or len(unexpected_keys) > 0:
184
+ print(f"ViT Encoder Missing keys: {missing_keys}")
185
+ print(f"ViT Encoder Unexpected keys: {unexpected_keys}")
186
+ del vit
187
+
188
+ def forward(self, x, freqs_cis):
189
+ out = self.fc_in(x)
190
+ out = self.encoder(out, freqs_cis)
191
+ out = checkpoint(self.fc_out, out)
192
+ return out
193
+
194
+
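+ # patchify/unpatchify convert between a dense feature map and an 8x8 space-to-depth
+ # layout (64x more channels, 8x smaller height/width), so the ViT operates on patch tokens.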
195
+ def patchify(x, patch_size=8):
196
+ if len(x.shape) == 4:
197
+ bs, c, h, w = x.shape
198
+ x = einops.rearrange(x, "b c (h p1) (w p2) -> b (c p1 p2) h w", p1=patch_size, p2=patch_size)
199
+ elif len(x.shape) == 3:
200
+ c, h, w = x.shape
201
+ x = einops.rearrange(x, "c (h p1) (w p2) -> (c p1 p2) h w", p1=patch_size, p2=patch_size)
202
+ return x
203
+
204
+
205
+ def unpatchify(x, patch_size=8):
206
+ if len(x.shape) == 4:
207
+ bs, c, h, w = x.shape
208
+ x = einops.rearrange(x, "b (c p1 p2) h w -> b c (h p1) (w p2)", p1=patch_size, p2=patch_size)
209
+ elif len(x.shape) == 3:
210
+ c, h, w = x.shape
211
+ x = einops.rearrange(x, "(c p1 p2) h w -> c (h p1) (w p2)", p1=patch_size, p2=patch_size)
212
+ return x
213
+
214
+
215
+ def crop_each_layer(hidden_states, use_layers, list_layer_box, H, W, pos_embedding):
216
+ token_list = []
217
+ cos_list, sin_list = [], []
218
+ for layer_idx in range(hidden_states.shape[1]):
219
+ if list_layer_box[layer_idx] is None:
220
+ continue
221
+ else:
222
+ x1, y1, x2, y2 = list_layer_box[layer_idx]
223
+ x1, y1, x2, y2 = x1 // 8, y1 // 8, x2 // 8, y2 // 8
224
+ layer_token = hidden_states[:, layer_idx, y1:y2, x1:x2]
225
+ c, h, w = layer_token.shape
226
+ layer_token = layer_token.reshape(c, -1)
227
+ token_list.append(layer_token)
228
+ ids = prepare_latent_image_ids(-1, H * 2, W * 2, hidden_states.device, hidden_states.dtype)
229
+ ids[:, 0] = use_layers[layer_idx]
230
+ image_rotary_emb = pos_embedding(ids)
231
+ pos_cos, pos_sin = image_rotary_emb[0].reshape(H, W, -1), image_rotary_emb[1].reshape(H, W, -1)
232
+ cos_list.append(pos_cos[y1:y2, x1:x2].reshape(-1, 64))
233
+ sin_list.append(pos_sin[y1:y2, x1:x2].reshape(-1, 64))
234
+ token_list = torch.cat(token_list, dim=1).permute(1, 0)
235
+ cos_list = torch.cat(cos_list, dim=0)
236
+ sin_list = torch.cat(sin_list, dim=0)
237
+ return token_list, (cos_list, sin_list)
238
+
239
+
240
+ def prepare_latent_image_ids(batch_size, height, width, device, dtype):
241
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
242
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
243
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
244
+
245
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
246
+
247
+ latent_image_ids = latent_image_ids.reshape(
248
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
249
+ )
250
+
251
+ return latent_image_ids.to(device=device, dtype=dtype)
252
+
253
+
254
+ class AutoencoderKLTransformerTraining(ModelMixin, ConfigMixin, FromOriginalModelMixin):
255
+ def __init__(self):
256
+ super().__init__()
257
+
258
+ self.decoder_arch = 'vit'
259
+ self.layer_embedding = 'rope'
260
+
261
+ self.decoder = ViTEncoder()
262
+ self.pos_embedding = FluxPosEmbed(theta=10000, axes_dim=(8, 28, 28))
263
+ if 'rel' in self.layer_embedding or 'abs' in self.layer_embedding:
264
+ self.layer_embedding = nn.Parameter(torch.empty(16, 2 + self.max_layers, 1, 1).normal_(std=0.02), requires_grad=True)
265
+
266
+ def zero_module(module):
267
+ """
268
+ Zero out the parameters of a module and return it.
269
+ """
270
+ for p in module.parameters():
271
+ p.detach().zero_()
272
+ return module
273
+
274
+ def encode(self, z_2d, box, use_layers):
275
+ B, C, T, H, W = z_2d.shape
276
+
277
+ z, freqs_cis = [], []
278
+ for b in range(B):
279
+ _z = z_2d[b]
280
+ if 'vit' in self.decoder_arch:
281
+ _use_layers = torch.tensor(use_layers[b], device=z_2d.device)
282
+ if 'rel' in self.layer_embedding:
283
+ _use_layers[_use_layers > 2] = 2
284
+ if 'rel' in self.layer_embedding or 'abs' in self.layer_embedding:
285
+ _z = _z + self.layer_embedding[:, _use_layers] # + self.pos_embedding
286
+ if 'rope' not in self.layer_embedding:
287
+ use_layers[b] = [0] * len(use_layers[b])
288
+ _z, cis = crop_each_layer(_z, use_layers[b], box[b], H, W, self.pos_embedding) ### modified
289
+ z.append(_z)
290
+ freqs_cis.append(cis)
291
+
292
+ return z, freqs_cis
293
+
294
+ def decode(self, z, freqs_cis, box, H, W):
295
+ B = len(z)
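+ # `pad` is the fill value for pixels outside a layer's box: RGB channels are 0 and the
+ # alpha channel is -1 (presumably fully transparent in the [-1, 1] normalized output).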
296
+ pad = torch.zeros(4, H, W, device=z[0].device, dtype=z[0].dtype)
297
+ pad[3, :, :] = -1
298
+ x = []
299
+ for b in range(B):
300
+ _x = []
301
+ _z = self.decoder(z[b].unsqueeze(0), freqs_cis[b]).squeeze(0)
302
+ current_index = 0
303
+ for layer_idx in range(len(box[b])):
304
+ if box[b][layer_idx] is None:
305
+ _x.append(pad.clone())
306
+ else:
307
+ x1, y1, x2, y2 = box[b][layer_idx]
308
+ x1_tok, y1_tok, x2_tok, y2_tok = x1 // 8, y1 // 8, x2 // 8, y2 // 8
309
+ token_length = (x2_tok - x1_tok) * (y2_tok - y1_tok)
310
+ tokens = _z[current_index:current_index + token_length]
311
+ pixels = einops.rearrange(tokens, "(h w) c -> c h w", h=y2_tok - y1_tok, w=x2_tok - x1_tok)
312
+ unpatched = unpatchify(pixels)
313
+ pixels = pad.clone()
314
+ pixels[:, y1:y2, x1:x2] = unpatched
315
+ _x.append(pixels)
316
+ current_index += token_length
317
+ _x = torch.stack(_x, dim=1)
318
+ x.append(_x)
319
+ x = torch.stack(x, dim=0)
320
+ return x
321
+
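+ # z_2d holds one latent per layer ([n_layers, C, H, W]); it is reshaped to
+ # [1, C, n_layers, H, W], encoded into box-cropped patch tokens with RoPE layer/position
+ # ids, then decoded back into per-layer RGB images and alpha mattes.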
322
+ def forward(self, z_2d, box, use_layers=None):
323
+ z_2d = z_2d.transpose(0, 1).unsqueeze(0)
324
+ use_layers = use_layers or [list(range(z_2d.shape[2]))]
325
+ z, freqs_cis = self.encode(z_2d, box, use_layers)
326
+ H, W = z_2d.shape[-2:]
327
+ x_hat = self.decode(z, freqs_cis, box, H * 8, W * 8)
328
+ assert x_hat.shape[0] == 1, x_hat.shape
329
+ x_hat = einops.rearrange(x_hat[0], "c t h w -> t c h w")
330
+ x_hat_rgb, x_hat_alpha = x_hat[:, :3], x_hat[:, 3:]
331
+ return x_hat_rgb, x_hat_alpha
custom_pipeline.py ADDED
@@ -0,0 +1,845 @@
1
+ import numpy as np
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from diffusers.utils.torch_utils import randn_tensor
8
+ from diffusers.utils import is_torch_xla_available, logging
9
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
10
+ from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps, FluxPipeline
11
+
12
+ if is_torch_xla_available():
13
+ import torch_xla.core.xla_model as xm # type: ignore
14
+ XLA_AVAILABLE = True
15
+ else:
16
+ XLA_AVAILABLE = False
17
+
18
+
19
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
20
+
21
+
22
+ def _get_clip_prompt_embeds(
23
+ tokenizer,
24
+ text_encoder,
25
+ prompt: Union[str, List[str]],
26
+ num_images_per_prompt: int = 1,
27
+ device: Optional[torch.device] = None,
28
+ ):
29
+ device = device or text_encoder.device
30
+ dtype = text_encoder.dtype
31
+
32
+ prompt = [prompt] if isinstance(prompt, str) else prompt
33
+ batch_size = len(prompt)
34
+
35
+ text_inputs = tokenizer(
36
+ prompt,
37
+ padding="max_length",
38
+ max_length=text_encoder.config.max_position_embeddings,
39
+ truncation=True,
40
+ return_overflowing_tokens=False,
41
+ return_length=False,
42
+ return_tensors="pt",
43
+ )
44
+
45
+ text_input_ids = text_inputs.input_ids
46
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
47
+
48
+ # Use pooled output of CLIPTextModel
49
+ prompt_embeds = prompt_embeds.pooler_output
50
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
51
+
52
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
53
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
54
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
55
+
56
+ return prompt_embeds
57
+
58
+
59
+ def _get_t5_prompt_embeds(
60
+ tokenizer,
61
+ text_encoder,
62
+ prompt: Union[str, List[str]] = None,
63
+ num_images_per_prompt: int = 1,
64
+ max_sequence_length: int = 512,
65
+ device: Optional[torch.device] = None,
66
+ dtype: Optional[torch.dtype] = None,
67
+ ):
68
+ device = device or text_encoder.device
69
+ dtype = dtype or text_encoder.dtype
70
+
71
+ prompt = [prompt] if isinstance(prompt, str) else prompt
72
+ batch_size = len(prompt)
73
+
74
+ text_inputs = tokenizer(
75
+ prompt,
76
+ padding="max_length",
77
+ max_length=max_sequence_length,
78
+ truncation=True,
79
+ return_length=False,
80
+ return_overflowing_tokens=False,
81
+ return_tensors="pt",
82
+ )
83
+ text_input_ids = text_inputs.input_ids
84
+
85
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)[0]
86
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
87
+
88
+ _, seq_len, _ = prompt_embeds.shape
89
+
90
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
91
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
92
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
93
+
94
+ return prompt_embeds
95
+
96
+
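+ # Flux-style dual text encoding: CLIP contributes only the pooled vector
+ # (pooled_projections) while T5 contributes the full token sequence used as
+ # encoder_hidden_states; text_ids are zeros and merely reserve rotary-position slots.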
97
+ def encode_prompt(
98
+ tokenizers,
99
+ text_encoders,
100
+ prompt: Union[str, List[str]],
101
+ prompt_2: Union[str, List[str]] = None,
102
+ num_images_per_prompt: int = 1,
103
+ max_sequence_length: int = 512,
104
+ ):
105
+
106
+ tokenizer_1, tokenizer_2 = tokenizers
107
+ text_encoder_1, text_encoder_2 = text_encoders
108
+ device = text_encoder_1.device
109
+ dtype = text_encoder_1.dtype
110
+
111
+ prompt = [prompt] if isinstance(prompt, str) else prompt
112
+ prompt_2 = prompt_2 or prompt
113
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
114
+
115
+ # We only use the pooled prompt output from the CLIPTextModel
116
+ pooled_prompt_embeds = _get_clip_prompt_embeds(
117
+ tokenizer=tokenizer_1,
118
+ text_encoder=text_encoder_1,
119
+ prompt=prompt,
120
+ num_images_per_prompt=num_images_per_prompt,
121
+ )
122
+ prompt_embeds = _get_t5_prompt_embeds(
123
+ tokenizer=tokenizer_2,
124
+ text_encoder=text_encoder_2,
125
+ prompt=prompt_2,
126
+ num_images_per_prompt=num_images_per_prompt,
127
+ max_sequence_length=max_sequence_length,
128
+ )
129
+
130
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
131
+
132
+ return prompt_embeds, pooled_prompt_embeds, text_ids
133
+
134
+
135
+ class CustomFluxPipeline(FluxPipeline):
136
+
137
+ @staticmethod
138
+ def _prepare_latent_image_ids(height, width, list_layer_box, device, dtype):
139
+
140
+ latent_image_ids_list = []
141
+ for layer_idx in range(len(list_layer_box)):
142
+ if list_layer_box[layer_idx] is None:
143
+ continue
144
+ else:
145
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3) # [h/2, w/2, 3]
146
+ latent_image_ids[..., 0] = layer_idx # use the first dimension for layer representation
147
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
148
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
149
+
150
+ x1, y1, x2, y2 = list_layer_box[layer_idx]
151
+ x1, y1, x2, y2 = x1 // 16, y1 // 16, x2 // 16, y2 // 16
152
+ latent_image_ids = latent_image_ids[y1:y2, x1:x2, :]
153
+
154
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
155
+ latent_image_ids = latent_image_ids.reshape(
156
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
157
+ )
158
+
159
+ latent_image_ids_list.append(latent_image_ids)
160
+
161
+ full_latent_image_ids = torch.cat(latent_image_ids_list, dim=0)
162
+
163
+ return full_latent_image_ids.to(device=device, dtype=dtype)
164
+
165
+ def prepare_latents(
166
+ self,
167
+ batch_size,
168
+ num_layers,
169
+ num_channels_latents,
170
+ height,
171
+ width,
172
+ list_layer_box,
173
+ dtype,
174
+ device,
175
+ generator,
176
+ latents=None,
177
+ ):
178
+ height = 2 * (int(height) // self.vae_scale_factor)
179
+ width = 2 * (int(width) // self.vae_scale_factor)
180
+
181
+ shape = (batch_size, num_layers, num_channels_latents, height, width)
182
+
183
+ if latents is not None:
184
+ latent_image_ids = self._prepare_latent_image_ids(height, width, list_layer_box, device, dtype)
185
+ return latents.to(device=device, dtype=dtype), latent_image_ids
186
+
187
+ if isinstance(generator, list) and len(generator) != batch_size:
188
+ raise ValueError(
189
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
190
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
191
+ )
192
+
193
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # [bs, f, c_latent, h, w]
194
+
195
+ latent_image_ids = self._prepare_latent_image_ids(height, width, list_layer_box, device, dtype)
196
+
197
+ return latents, latent_image_ids
198
+
199
+ @torch.no_grad()
200
+ def __call__(
201
+ self,
202
+ prompt: Union[str, List[str]] = None,
203
+ prompt_2: Optional[Union[str, List[str]]] = None,
204
+ validation_box: List[tuple] = None,
205
+ height: Optional[int] = None,
206
+ width: Optional[int] = None,
207
+ num_inference_steps: int = 28,
208
+ timesteps: List[int] = None,
209
+ guidance_scale: float = 3.5,
210
+ num_images_per_prompt: Optional[int] = 1,
211
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
212
+ latents: Optional[torch.FloatTensor] = None,
213
+ prompt_embeds: Optional[torch.FloatTensor] = None,
214
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
215
+ output_type: Optional[str] = "pil",
216
+ return_dict: bool = True,
217
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
218
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
219
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
220
+ max_sequence_length: int = 512,
221
+ num_layers: int = 5,
222
+ sdxl_vae: nn.Module = None,
223
+ transparent_decoder: nn.Module = None,
224
+ ):
225
+ r"""
226
+ Function invoked when calling the pipeline for generation.
227
+
228
+ Args:
229
+ prompt (`str` or `List[str]`, *optional*):
230
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
231
+ instead.
232
+ prompt_2 (`str` or `List[str]`, *optional*):
233
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
234
+ will be used instead
235
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
236
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
237
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
238
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
239
+ num_inference_steps (`int`, *optional*, defaults to 28):
240
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
241
+ expense of slower inference.
242
+ timesteps (`List[int]`, *optional*):
243
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
244
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
245
+ passed will be used. Must be in descending order.
246
+ guidance_scale (`float`, *optional*, defaults to 3.5):
247
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
248
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
249
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
250
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
251
+ usually at the expense of lower image quality.
252
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
253
+ The number of images to generate per prompt.
254
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
255
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
256
+ to make generation deterministic.
257
+ latents (`torch.FloatTensor`, *optional*):
258
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
259
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
260
+ tensor will be generated by sampling using the supplied random `generator`.
261
+ prompt_embeds (`torch.FloatTensor`, *optional*):
262
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
263
+ provided, text embeddings will be generated from `prompt` input argument.
264
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
265
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
266
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
267
+ output_type (`str`, *optional*, defaults to `"pil"`):
268
+ The output format of the generated image. Choose between
269
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
270
+ return_dict (`bool`, *optional*, defaults to `True`):
271
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
272
+ joint_attention_kwargs (`dict`, *optional*):
273
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
274
+ `self.processor` in
275
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
276
+ callback_on_step_end (`Callable`, *optional*):
277
+ A callback function invoked at the end of each denoising step during inference. The function is called
278
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
279
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
280
+ `callback_on_step_end_tensor_inputs`.
281
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
282
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
283
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
284
+ `._callback_tensor_inputs` attribute of your pipeline class.
285
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
286
+
287
+ Examples:
288
+
289
+ Returns:
290
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
291
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
292
+ images.
293
+ """
294
+
295
+ height = height or self.default_sample_size * self.vae_scale_factor
296
+ width = width or self.default_sample_size * self.vae_scale_factor
297
+
298
+ # 1. Check inputs. Raise error if not correct
299
+ self.check_inputs(
300
+ prompt,
301
+ prompt_2,
302
+ height,
303
+ width,
304
+ prompt_embeds=prompt_embeds,
305
+ pooled_prompt_embeds=pooled_prompt_embeds,
306
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
307
+ max_sequence_length=max_sequence_length,
308
+ )
309
+
310
+ self._guidance_scale = guidance_scale
311
+ self._joint_attention_kwargs = joint_attention_kwargs
312
+ self._interrupt = False
313
+
314
+ # 2. Define call parameters
315
+ if prompt is not None and isinstance(prompt, str):
316
+ batch_size = 1
317
+ elif prompt is not None and isinstance(prompt, list):
318
+ batch_size = len(prompt)
319
+ else:
320
+ batch_size = prompt_embeds.shape[0]
321
+
322
+ device = self._execution_device
323
+
324
+ lora_scale = (
325
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
326
+ )
327
+ (
328
+ prompt_embeds,
329
+ pooled_prompt_embeds,
330
+ text_ids,
331
+ ) = self.encode_prompt(
332
+ prompt=prompt,
333
+ prompt_2=prompt_2,
334
+ prompt_embeds=prompt_embeds,
335
+ pooled_prompt_embeds=pooled_prompt_embeds,
336
+ device=device,
337
+ num_images_per_prompt=num_images_per_prompt,
338
+ max_sequence_length=max_sequence_length,
339
+ lora_scale=lora_scale,
340
+ )
341
+
342
+ # 4. Prepare latent variables
343
+ num_channels_latents = self.transformer.config.in_channels // 4
344
+ latents, latent_image_ids = self.prepare_latents(
345
+ batch_size * num_images_per_prompt,
346
+ num_layers,
347
+ num_channels_latents,
348
+ height,
349
+ width,
350
+ validation_box,
351
+ prompt_embeds.dtype,
352
+ device,
353
+ generator,
354
+ latents,
355
+ )
356
+
357
+ # 5. Prepare timesteps
358
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
359
+ image_seq_len = latent_image_ids.shape[0] # total number of packed image tokens across all layer boxes
360
+ mu = calculate_shift(
361
+ image_seq_len,
362
+ self.scheduler.config.base_image_seq_len,
363
+ self.scheduler.config.max_image_seq_len,
364
+ self.scheduler.config.base_shift,
365
+ self.scheduler.config.max_shift,
366
+ )
367
+ timesteps, num_inference_steps = retrieve_timesteps(
368
+ self.scheduler,
369
+ num_inference_steps,
370
+ device,
371
+ timesteps,
372
+ sigmas,
373
+ mu=mu,
374
+ )
375
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
376
+ self._num_timesteps = len(timesteps)
377
+
378
+ # handle guidance
379
+ if self.transformer.config.guidance_embeds:
380
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
381
+ guidance = guidance.expand(latents.shape[0])
382
+ else:
383
+ guidance = None
384
+
385
+ # 6. Denoising loop
386
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
387
+ for i, t in enumerate(timesteps):
388
+ if self.interrupt:
389
+ continue
390
+
391
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
392
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
393
+
394
+ noise_pred = self.transformer(
395
+ hidden_states=latents,
396
+ list_layer_box=validation_box,
397
+ timestep=timestep / 1000,
398
+ guidance=guidance,
399
+ pooled_projections=pooled_prompt_embeds,
400
+ encoder_hidden_states=prompt_embeds,
401
+ txt_ids=text_ids,
402
+ img_ids=latent_image_ids,
403
+ joint_attention_kwargs=self.joint_attention_kwargs,
404
+ return_dict=False,
405
+ )[0]
406
+
407
+ # compute the previous noisy sample x_t -> x_t-1
408
+ latents_dtype = latents.dtype
409
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
410
+
411
+ if latents.dtype != latents_dtype:
412
+ if torch.backends.mps.is_available():
413
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
414
+ latents = latents.to(latents_dtype)
415
+
416
+ if callback_on_step_end is not None:
417
+ callback_kwargs = {}
418
+ for k in callback_on_step_end_tensor_inputs:
419
+ callback_kwargs[k] = locals()[k]
420
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
421
+
422
+ latents = callback_outputs.pop("latents", latents)
423
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
424
+
425
+ # call the callback, if provided
426
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
427
+ progress_bar.update()
428
+
429
+ if XLA_AVAILABLE:
430
+ xm.mark_step()
431
+
432
+ # create a grey latent
433
+ bs, n_frames, channel_latent, height, width = latents.shape
434
+
435
+ pixel_grey = torch.zeros(size=(bs*n_frames, 3, height*8, width*8), device=latents.device, dtype=latents.dtype)
436
+ latent_grey = self.vae.encode(pixel_grey).latent_dist.sample()
437
+ latent_grey = (latent_grey - self.vae.config.shift_factor) * self.vae.config.scaling_factor
438
+ latent_grey = latent_grey.view(bs, n_frames, channel_latent, height, width) # [bs, f, c_latent, h, w]
439
+
440
+ # fill in the latents
441
+ for layer_idx in range(latent_grey.shape[1]):
442
+ x1, y1, x2, y2 = validation_box[layer_idx]
443
+ x1, y1, x2, y2 = x1 // 8, y1 // 8, x2 // 8, y2 // 8
444
+ latent_grey[:, layer_idx, :, y1:y2, x1:x2] = latents[:, layer_idx, :, y1:y2, x1:x2]
445
+ latents = latent_grey
446
+
447
+ if output_type == "latent":
448
+ image = latents
449
+
450
+ else:
451
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
452
+ latents = latents.reshape(bs * n_frames, channel_latent, height, width)
453
+ image = self.vae.decode(latents, return_dict=False)[0]
454
+ if sdxl_vae is not None:
455
+ sdxl_vae = sdxl_vae.to(dtype=image.dtype, device=image.device)
456
+ sdxl_latents = sdxl_vae.encode(image).latent_dist.sample()
457
+ transparent_decoder = transparent_decoder.to(dtype=image.dtype, device=image.device)
458
+ result_list, vis_list = transparent_decoder(sdxl_vae, sdxl_latents)
459
+ else:
460
+ result_list, vis_list = None, None
461
+ image = self.image_processor.postprocess(image, output_type=output_type)
462
+
463
+ # Offload all models
464
+ self.maybe_free_model_hooks()
465
+
466
+ if not return_dict:
467
+ return (image, result_list, vis_list)
468
+
469
+ return FluxPipelineOutput(images=image), result_list, vis_list
470
+
471
+
472
+ class CustomFluxPipelineCfg(FluxPipeline):
473
+
474
+ @staticmethod
475
+ def _prepare_latent_image_ids(height, width, list_layer_box, device, dtype):
476
+
477
+ latent_image_ids_list = []
478
+ for layer_idx in range(len(list_layer_box)):
479
+ if list_layer_box[layer_idx] is None:
480
+ continue
481
+ else:
482
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3) # [h/2, w/2, 3]
483
+ latent_image_ids[..., 0] = layer_idx # use the first dimension for layer representation
484
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
485
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
486
+
487
+ x1, y1, x2, y2 = list_layer_box[layer_idx]
488
+ x1, y1, x2, y2 = x1 // 16, y1 // 16, x2 // 16, y2 // 16
489
+ latent_image_ids = latent_image_ids[y1:y2, x1:x2, :]
490
+
491
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
492
+ latent_image_ids = latent_image_ids.reshape(
493
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
494
+ )
495
+
496
+ latent_image_ids_list.append(latent_image_ids)
497
+
498
+ full_latent_image_ids = torch.cat(latent_image_ids_list, dim=0)
499
+
500
+ return full_latent_image_ids.to(device=device, dtype=dtype)
501
+
502
+ def prepare_latents(
503
+ self,
504
+ batch_size,
505
+ num_layers,
506
+ num_channels_latents,
507
+ height,
508
+ width,
509
+ list_layer_box,
510
+ dtype,
511
+ device,
512
+ generator,
513
+ latents=None,
514
+ ):
515
+ height = 2 * (int(height) // self.vae_scale_factor)
516
+ width = 2 * (int(width) // self.vae_scale_factor)
517
+
518
+ shape = (batch_size, num_layers, num_channels_latents, height, width)
519
+
520
+ if latents is not None:
521
+ latent_image_ids = self._prepare_latent_image_ids(height, width, list_layer_box, device, dtype)
522
+ return latents.to(device=device, dtype=dtype), latent_image_ids
523
+
524
+ if isinstance(generator, list) and len(generator) != batch_size:
525
+ raise ValueError(
526
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
527
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
528
+ )
529
+
530
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # [bs, n_layers, c_latent, h, w]
531
+
532
+ latent_image_ids = self._prepare_latent_image_ids(height, width, list_layer_box, device, dtype)
533
+
534
+ return latents, latent_image_ids
535
+
536
+ @torch.no_grad()
537
+ def __call__(
538
+ self,
539
+ prompt: Union[str, List[str]] = None,
540
+ prompt_2: Optional[Union[str, List[str]]] = None,
541
+ validation_box: List[tuple] = None,
542
+ height: Optional[int] = None,
543
+ width: Optional[int] = None,
544
+ num_inference_steps: int = 28,
545
+ timesteps: List[int] = None,
546
+ guidance_scale: float = 3.5,
547
+ true_gs: float = 3.5,
548
+ num_images_per_prompt: Optional[int] = 1,
549
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
550
+ latents: Optional[torch.FloatTensor] = None,
551
+ prompt_embeds: Optional[torch.FloatTensor] = None,
552
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
553
+ output_type: Optional[str] = "pil",
554
+ return_dict: bool = True,
555
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
556
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
557
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
558
+ max_sequence_length: int = 512,
559
+ num_layers: int = 5,
560
+ transparent_decoder: nn.Module = None,
561
+ ):
562
+ r"""
563
+ Function invoked when calling the pipeline for generation.
564
+
565
+ Args:
566
+ prompt (`str` or `List[str]`, *optional*):
567
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
568
+ instead.
569
+ prompt_2 (`str` or `List[str]`, *optional*):
570
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
571
+ will be used instead
572
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
573
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
574
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
575
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
576
+ num_inference_steps (`int`, *optional*, defaults to 28):
577
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
578
+ expense of slower inference.
579
+ timesteps (`List[int]`, *optional*):
580
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
581
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
582
+ passed will be used. Must be in descending order.
583
+ guidance_scale (`float`, *optional*, defaults to 3.5):
584
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
585
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
586
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
587
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
588
+ usually at the expense of lower image quality.
589
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
590
+ The number of images to generate per prompt.
591
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
592
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
593
+ to make generation deterministic.
594
+ latents (`torch.FloatTensor`, *optional*):
595
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
596
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
597
+ tensor will be generated by sampling using the supplied random `generator`.
598
+ prompt_embeds (`torch.FloatTensor`, *optional*):
599
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
600
+ provided, text embeddings will be generated from `prompt` input argument.
601
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
602
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
603
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
604
+ output_type (`str`, *optional*, defaults to `"pil"`):
605
+ The output format of the generated image. Choose between
606
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
607
+ return_dict (`bool`, *optional*, defaults to `True`):
608
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
609
+ joint_attention_kwargs (`dict`, *optional*):
610
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
611
+ `self.processor` in
612
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
613
+ callback_on_step_end (`Callable`, *optional*):
614
+ A callback function invoked at the end of each denoising step during inference. The function is called
615
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
616
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
617
+ `callback_on_step_end_tensor_inputs`.
618
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
619
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
620
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
621
+ `._callback_tensor_inputs` attribute of your pipeline class.
622
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
623
+
624
+ Examples:
625
+
626
+ Returns:
627
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
628
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
629
+ images.
630
+ """
631
+
632
+ height = height or self.default_sample_size * self.vae_scale_factor
633
+ width = width or self.default_sample_size * self.vae_scale_factor
634
+
635
+ # 1. Check inputs. Raise error if not correct
636
+ self.check_inputs(
637
+ prompt,
638
+ prompt_2,
639
+ height,
640
+ width,
641
+ prompt_embeds=prompt_embeds,
642
+ pooled_prompt_embeds=pooled_prompt_embeds,
643
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
644
+ max_sequence_length=max_sequence_length,
645
+ )
646
+
647
+ self._guidance_scale = guidance_scale
648
+ self._joint_attention_kwargs = joint_attention_kwargs
649
+ self._interrupt = False
650
+
651
+ # 2. Define call parameters
652
+ if prompt is not None and isinstance(prompt, str):
653
+ batch_size = 1
654
+ elif prompt is not None and isinstance(prompt, list):
655
+ batch_size = len(prompt)
656
+ else:
657
+ batch_size = prompt_embeds.shape[0]
658
+
659
+ device = self._execution_device
660
+
661
+ lora_scale = (
662
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
663
+ )
664
+ (
665
+ prompt_embeds,
666
+ pooled_prompt_embeds,
667
+ text_ids,
668
+ ) = self.encode_prompt(
669
+ prompt=prompt,
670
+ prompt_2=prompt_2,
671
+ prompt_embeds=prompt_embeds,
672
+ pooled_prompt_embeds=pooled_prompt_embeds,
673
+ device=device,
674
+ num_images_per_prompt=num_images_per_prompt,
675
+ max_sequence_length=max_sequence_length,
676
+ lora_scale=lora_scale,
677
+ )
678
+ (
679
+ neg_prompt_embeds,
680
+ neg_pooled_prompt_embeds,
681
+ neg_text_ids,
682
+ ) = self.encode_prompt(
683
+ prompt="",
684
+ prompt_2=None,
685
+ device=device,
686
+ num_images_per_prompt=num_images_per_prompt,
687
+ max_sequence_length=max_sequence_length,
688
+ lora_scale=lora_scale,
689
+ )
690
+
691
+ # 4. Prepare latent variables
692
+ num_channels_latents = self.transformer.config.in_channels // 4
693
+ latents, latent_image_ids = self.prepare_latents(
694
+ batch_size * num_images_per_prompt,
695
+ num_layers,
696
+ num_channels_latents,
697
+ height,
698
+ width,
699
+ validation_box,
700
+ prompt_embeds.dtype,
701
+ device,
702
+ generator,
703
+ latents,
704
+ )
705
+
706
+ # 5. Prepare timesteps
707
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
708
+ image_seq_len = latent_image_ids.shape[0]
709
+ mu = calculate_shift(
710
+ image_seq_len,
711
+ self.scheduler.config.base_image_seq_len,
712
+ self.scheduler.config.max_image_seq_len,
713
+ self.scheduler.config.base_shift,
714
+ self.scheduler.config.max_shift,
715
+ )
716
+ timesteps, num_inference_steps = retrieve_timesteps(
717
+ self.scheduler,
718
+ num_inference_steps,
719
+ device,
720
+ timesteps,
721
+ sigmas,
722
+ mu=mu,
723
+ )
724
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
725
+ self._num_timesteps = len(timesteps)
726
+
727
+ # handle guidance
728
+ if self.transformer.config.guidance_embeds:
729
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
730
+ guidance = guidance.expand(latents.shape[0])
731
+ else:
732
+ guidance = None
733
+
734
+ # 6. Denoising loop
735
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
736
+ for i, t in enumerate(timesteps):
737
+ if self.interrupt:
738
+ continue
739
+
740
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
741
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
742
+
743
+ noise_pred = self.transformer(
744
+ hidden_states=latents,
745
+ list_layer_box=validation_box,
746
+ timestep=timestep / 1000,
747
+ guidance=guidance,
748
+ pooled_projections=pooled_prompt_embeds,
749
+ encoder_hidden_states=prompt_embeds,
750
+ txt_ids=text_ids,
751
+ img_ids=latent_image_ids,
752
+ joint_attention_kwargs=self.joint_attention_kwargs,
753
+ return_dict=False,
754
+ )[0]
755
+
756
+ neg_noise_pred = self.transformer(
757
+ hidden_states=latents,
758
+ list_layer_box=validation_box,
759
+ timestep=timestep / 1000,
760
+ guidance=guidance,
761
+ pooled_projections=neg_pooled_prompt_embeds,
762
+ encoder_hidden_states=neg_prompt_embeds,
763
+ txt_ids=neg_text_ids,
764
+ img_ids=latent_image_ids,
765
+ joint_attention_kwargs=self.joint_attention_kwargs,
766
+ return_dict=False,
767
+ )[0]
768
+
769
+ noise_pred = neg_noise_pred + true_gs * (noise_pred - neg_noise_pred)
770
+
771
+ # compute the previous noisy sample x_t -> x_t-1
772
+ latents_dtype = latents.dtype
773
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
774
+
775
+ if latents.dtype != latents_dtype:
776
+ if torch.backends.mps.is_available():
777
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
778
+ latents = latents.to(latents_dtype)
779
+
780
+ if callback_on_step_end is not None:
781
+ callback_kwargs = {}
782
+ for k in callback_on_step_end_tensor_inputs:
783
+ callback_kwargs[k] = locals()[k]
784
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
785
+
786
+ latents = callback_outputs.pop("latents", latents)
787
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
788
+
789
+ # call the callback, if provided
790
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
791
+ progress_bar.update()
792
+
793
+ if XLA_AVAILABLE:
794
+ xm.mark_step()
795
+
796
+ # create a grey latent
797
+ bs, n_layers, channel_latent, height, width = latents.shape
798
+
799
+ pixel_grey = torch.zeros(size=(bs*n_layers, 3, height*8, width*8), device=latents.device, dtype=latents.dtype)
800
+ latent_grey = self.vae.encode(pixel_grey).latent_dist.sample()
801
+ latent_grey = (latent_grey - self.vae.config.shift_factor) * self.vae.config.scaling_factor
802
+ latent_grey = latent_grey.view(bs, n_layers, channel_latent, height, width) # [bs, n_layers, c_latent, h, w]
803
+
804
+ # fill in the latents
805
+ for layer_idx in range(latent_grey.shape[1]):
806
+ if validation_box[layer_idx] is None:
807
+ continue
808
+ x1, y1, x2, y2 = validation_box[layer_idx]
809
+ x1, y1, x2, y2 = x1 // 8, y1 // 8, x2 // 8, y2 // 8
810
+ latent_grey[:, layer_idx, :, y1:y2, x1:x2] = latents[:, layer_idx, :, y1:y2, x1:x2]
811
+ latents = latent_grey
812
+
813
+ if output_type == "latent":
814
+ image = latents
815
+
816
+ else:
817
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
818
+ latents = latents.reshape(bs * n_layers, channel_latent, height, width)
819
+ latents_segs = torch.split(latents, 16, dim=0) ### split latents by 16 to avoid odd purple output
820
+ image_segs = [self.vae.decode(latents_seg, return_dict=False)[0] for latents_seg in latents_segs]
821
+ image = torch.cat(image_segs, dim=0)
822
+ if transparent_decoder is not None:
823
+ transparent_decoder = transparent_decoder.to(dtype=image.dtype, device=image.device)
824
+
825
+ decoded_fg, decoded_alpha = transparent_decoder(latents, [validation_box])
826
+ decoded_alpha = (decoded_alpha + 1.0) / 2.0
827
+ decoded_alpha = torch.clamp(decoded_alpha, min=0.0, max=1.0).permute(0, 2, 3, 1)
828
+
829
+ decoded_fg = (decoded_fg + 1.0) / 2.0
830
+ decoded_fg = torch.clamp(decoded_fg, min=0.0, max=1.0).permute(0, 2, 3, 1)
831
+
832
+ vis_list = None
833
+ png = torch.cat([decoded_fg, decoded_alpha], dim=3)
834
+ result_list = (png * 255.0).detach().cpu().float().numpy().clip(0, 255).astype(np.uint8)
835
+ else:
836
+ result_list, vis_list = None, None
837
+ image = self.image_processor.postprocess(image, output_type=output_type)
838
+
839
+ # Offload all models
840
+ self.maybe_free_model_hooks()
841
+
842
+ if not return_dict:
843
+ return (image, result_list, vis_list, latents)
844
+
845
+ return FluxPipelineOutput(images=image), result_list, vis_list, latents
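Both pipelines extend FluxPipeline with one latent plane per layer: `validation_box` carries one `(x1, y1, x2, y2)` pixel box per layer (or `None` to skip a layer), and the boxes are divided by 16 when building the packed latent token ids, so multiples of 16 work best. A minimal invocation sketch follows; the checkpoint name, prompt, and box values are illustrative assumptions, and the loaded transformer is assumed to be a layer-aware variant that accepts the extra `list_layer_box` argument used above.

# Hedged usage sketch for CustomFluxPipelineCfg; checkpoint and boxes below are placeholders.
import torch

pipeline = CustomFluxPipelineCfg.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # assumed base weights; the Space swaps in its own layer-aware transformer
    torch_dtype=torch.bfloat16,
).to("cuda")

# One pixel box per layer (x1, y1, x2, y2); None skips that layer entirely.
validation_box = [(0, 0, 512, 512), (0, 0, 512, 512), (64, 64, 448, 192), None, None]

output, result_list, vis_list, latents = pipeline(
    prompt="a poster with a bold headline over a sunset background",
    validation_box=validation_box,
    height=512,
    width=512,
    num_inference_steps=28,
    guidance_scale=3.5,
    true_gs=3.5,
    num_layers=len(validation_box),
    generator=torch.Generator("cuda").manual_seed(0),
)
layer_images = output.images  # one decoded image per layer; result_list is populated only when a transparent decoder is passed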
modeling_crello.py ADDED
@@ -0,0 +1,235 @@
1
+ import torch
2
+ from transformers import PreTrainedModel, PretrainedConfig, AutoModel, AutoModelForCausalLM, OPTForCausalLM
3
+ # from transformers import BitsAndBytesConfig
4
+ from torch import nn
5
+ import os
6
+ from typing import Optional, List
7
+ import os
8
+
9
+ def kmp_preprocess(pattern):
10
+ pattern_len = len(pattern)
11
+ prefix_suffix = [0] * pattern_len
12
+ j = 0
13
+
14
+ for i in range(1, pattern_len):
15
+ while j > 0 and pattern[i] != pattern[j]:
16
+ j = prefix_suffix[j - 1]
17
+
18
+ if pattern[i] == pattern[j]:
19
+ j += 1
20
+
21
+ prefix_suffix[i] = j
22
+
23
+ return prefix_suffix
24
+
25
+ def kmp_search(text, pattern):
26
+ text_len = len(text)
27
+ pattern_len = len(pattern)
28
+ prefix_suffix = kmp_preprocess(pattern)
29
+ matches = []
30
+
31
+ j = 0
32
+ for i in range(text_len):
33
+ while j > 0 and text[i] != pattern[j]:
34
+ j = prefix_suffix[j - 1]
35
+
36
+ if text[i] == pattern[j]:
37
+ j += 1
38
+
39
+ if j == pattern_len:
40
+ matches.append(i - j + 1)
41
+ j = prefix_suffix[j - 1]
42
+
43
+ return matches
44
+
45
+ class ModelWrapper:
46
+ def __init__(self, model):
47
+ self.model = model
48
+
49
+ def __getattr__(self, name):
50
+ return getattr(self.model, name)
51
+
52
+ @torch.no_grad()
53
+ def __call__(self, pixel_values):
54
+ return self.model(pixel_values)
55
+
56
+ def eval(self):
57
+ pass
58
+
59
+ def train(self):
60
+ pass
61
+
62
+
63
+ def parameters(self):
64
+ return self.model.parameters()
65
+
66
+
67
+ class CrelloModelConfig(PretrainedConfig):
68
+ def __init__(
69
+ self,
70
+ old_vocab_size: int = 32000,
71
+ vocab_size: int = 32000,
72
+ pad_token_id: int = 2,
73
+ ignore_ids: List[int] = [],
74
+
75
+ freeze_lm: bool = True, # lm.eval()
76
+ opt_version: str = 'facebook/opt-6.7b',
77
+
78
+ task: str = 'captioning',
79
+
80
+ use_lora: bool = False,
81
+ lora_alpha: int = 32,
82
+ lora_r: int = 8,
83
+ lora_dropout: float = 0.05,
84
+ lora_target_modules: str = r'.*\.(q_proj|v_proj)',
85
+
86
+ hidden_size: int = -1,
87
+ load_in_4bit: Optional[bool] = False,
88
+
89
+ **kwargs,
90
+ ):
91
+ super().__init__(**kwargs)
92
+ assert old_vocab_size > 0, 'old_vocab_size must be positive'
93
+ assert vocab_size > 0, 'vocab_size must be positive'
94
+
95
+ self.old_vocab_size = old_vocab_size
96
+ self.vocab_size = vocab_size
97
+ self.pad_token_id = pad_token_id
98
+ self.freeze_lm = freeze_lm
99
+ self.opt_version = opt_version
100
+ self.task = task
101
+ self.use_lora = use_lora
102
+ self.lora_alpha = lora_alpha
103
+ self.lora_r = lora_r
104
+ self.lora_dropout = lora_dropout
105
+ self.lora_target_modules = lora_target_modules
106
+ self.hidden_size = hidden_size
107
+ self.load_in_4bit = load_in_4bit
108
+ self.ignore_ids = ignore_ids
109
+
110
+
111
+ class CrelloModel(PreTrainedModel):
112
+ config_class = CrelloModelConfig
113
+ supports_gradient_checkpointing = True
114
+
115
+ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
116
+ self.lm.gradient_checkpointing_enable()
117
+
118
+ def __init__(self, config: CrelloModelConfig): # explicitly annotate the config type
119
+ super().__init__(config)
120
+
121
+ self.pad_token_id = config.pad_token_id
122
+
123
+ self.args = config
124
+
125
+ opt_version = "WYBar/LLM_For_Layout_Planning"
126
+
127
+ print(f"Using {opt_version} for the language model.")
128
+
129
+ if 'facebook/opt' in opt_version:
130
+ self.lm = OPTForCausalLM.from_pretrained(opt_version)
131
+ word_embed_proj_dim = self.lm.config.word_embed_proj_dim
132
+ else:
133
+ if config.load_in_4bit:
134
+ print("\n would load_in_4bit")
135
+ quantization_config = None
136
+ # This means: fit the entire model on the GPU:0
137
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
138
+ device_map = {"": local_rank}
139
+ torch_dtype = torch.bfloat16
140
+ else:
141
+ print("\n wouldn't load_in_4bit")
142
+ quantization_config = None
143
+ device_map = None
144
+ torch_dtype = None
145
+
146
+ self.lm = AutoModelForCausalLM.from_pretrained(
147
+ "WYBar/LLM_For_Layout_Planning",
148
+ subfolder="Meta-Llama-3-8B",
149
+ # use_auth_token=use_auth_token,
150
+ # quantization_config=quantization_config,
151
+ # device_map=device_map,
152
+ trust_remote_code=True,
153
+ torch_dtype=torch.bfloat16,
154
+ # cache_dir="/openseg_blob/v-yanbin/GradioDemo/cache_dir",
155
+ )
156
+ word_embed_proj_dim = self.lm.config.hidden_size
157
+ self.config.hidden_size = self.lm.config.hidden_size
158
+ self.opt_version = opt_version
159
+
160
+ if self.args.freeze_lm:
161
+ self.lm.eval()
162
+ print("Freezing the LM.")
163
+ # for param in self.lm.parameters():
164
+ # param.requires_grad = False
165
+ else:
166
+ print("\n no freeze lm, so to train lm")
167
+ self.lm.train()
168
+ self.lm.config.gradient_checkpointing = True
169
+
170
+ # print('resize token embeddings to match the tokenizer', config.vocab_size)
171
+ # self.lm.resize_token_embeddings(config.vocab_size)
172
+ # self.input_embeddings = self.lm.get_input_embeddings()
173
+ # print('after token embeddings to match the tokenizer', config.vocab_size)
174
+
175
+ def train(self, mode=True):
176
+ super().train(mode=mode)
177
+ # Overwrite train() to ensure frozen models remain frozen.
178
+ if self.args.freeze_lm:
179
+ self.lm.eval()
180
+
181
+ def forward(
182
+ self,
183
+ labels: torch.LongTensor,
184
+ ):
185
+ batch_size = labels.shape[0]
186
+ full_labels = labels.detach().clone()
187
+
188
+ input_embs = self.input_embeddings(labels) # (N, T, D)
189
+ input_embs_norm = ((input_embs ** 2).sum(dim=-1) ** 0.5).mean()
190
+
191
+ for ignore_id in self.config.ignore_ids:
192
+ full_labels[full_labels == ignore_id] = -100
193
+
194
+ pad_idx = []
195
+ # for each example in the batch, record its effective sequence length (max_len or the first padding position) in pad_idx
196
+ # -100 is the ignore index for cross entropy loss. https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
197
+ for label in full_labels:
198
+ for k, token in enumerate(label):
199
+ # Mask out pad tokens if they exist.
200
+ if token in [self.pad_token_id]:
201
+ label[k:] = -100 # mask out this token and everything after it
202
+ pad_idx.append(k)
203
+ break
204
+ if k == len(label) - 1: # No padding found.
205
+ pad_idx.append(k + 1)
206
+ assert len(pad_idx) == batch_size, (len(pad_idx), batch_size)
207
+
208
+ output = self.lm( inputs_embeds=input_embs,
209
+ # input_ids=labels,
210
+ labels=full_labels,
211
+ output_hidden_states=True)
212
+
213
+ return output, full_labels, input_embs_norm
214
+
215
+ if __name__=="__main__":
216
+ config = CrelloModelConfig(
217
+ vocab_size=50265,
218
+ image_reg_token=50264,
219
+ image_gt_token=50263,
220
+ )
221
+ print("config: ",config)
222
+ model1 = CrelloModel(config)
223
+ print("\nmodel1: ",model1)
224
+ model1.save_pretrained('test')
225
+ model2 = CrelloModel.from_pretrained('test')
226
+ print("\nmodel2: ",model2)
227
+ # compare model1 and model2
228
+
229
+ state_dict1 = model1.state_dict()
230
+ state_dict2 = model2.state_dict()
231
+ assert set(state_dict1.keys()) == set(state_dict2.keys())
232
+ for k in state_dict1.keys():
233
+ assert torch.equal(state_dict1[k], state_dict2[k])
234
+ print('all parameters are equal')
235
+
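`CrelloModel.forward` embeds the token ids and builds `full_labels` by writing `-100` (the cross-entropy ignore index) over every id listed in `config.ignore_ids` and over everything from the first pad token onward. A small self-contained sketch of that masking rule, with made-up token ids purely for illustration:

# Sketch of the label masking performed in CrelloModel.forward (token ids below are made up).
import torch

pad_token_id = 2
ignore_ids = [7]

labels = torch.tensor([[5, 7, 9, 2, 2],    # padding starts at position 3
                       [5, 9, 9, 9, 9]])   # no padding
full_labels = labels.clone()

for ignore_id in ignore_ids:
    full_labels[full_labels == ignore_id] = -100

pad_idx = []
for label in full_labels:
    for k, token in enumerate(label):
        if token == pad_token_id:
            label[k:] = -100           # mask the pad token and everything after it
            pad_idx.append(k)
            break
        if k == len(label) - 1:        # no padding found
            pad_idx.append(k + 1)

print(full_labels)  # tensor([[   5, -100,    9, -100, -100],
                    #         [   5,    9,    9,    9,    9]])
print(pad_idx)      # [3, 5]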
quantizer.py ADDED
@@ -0,0 +1,552 @@
1
+ import torch
2
+ import numpy as np
3
+ import copy
4
+ from collections import OrderedDict
5
+ import json
6
+ from datasets import ClassLabel
7
+ import random
8
+ import math
9
+ from functools import lru_cache
10
+ from matplotlib import font_manager
11
+ from colorama import Fore, Style, init
12
+
13
+
14
+ class BaseQuantizer:
15
+ @property
16
+ def ignore_tokens(self):
17
+ if self.num_mask_tokens > 0:
18
+ if self.mask_type == 'cm3':
19
+ return [self.predict_start_token] + self.mask_tokens
20
+ elif self.mask_type == 'mask_aug':
21
+ return [self.mask_aug_token]
22
+ else:
23
+ raise ValueError(f'Invalid mask type {self.mask_type}')
24
+ else:
25
+ return []
26
+
27
+ def __init__(self, simplify_json=False, mask_all=False,
28
+ num_mask_tokens=0, mask_type='cm3', **kwargs):
29
+ self.simplify_json=simplify_json
30
+ self.io_ignore_replace_tokens = ['<split-text>']
31
+ self.mask_all = mask_all
32
+ self.num_mask_tokens = num_mask_tokens
33
+ self.mask_type = mask_type
34
+ if self.mask_type == 'mask_aug':
35
+ self.mask_aug_token = '<mask-aug>'
36
+ elif self.mask_type == 'cm3':
37
+ self.predict_start_token = '<pred-start>'
38
+ else:
39
+ raise ValueError(f'Invalid mask type {self.mask_type}')
40
+
41
+ def get_additional_mask_tokens(self):
42
+ if self.mask_type == 'cm3': # two configurations: 1. ['<pred-start>'] plus '<mask-%d>' tokens, one per self.num_mask_tokens; 2. ['<mask-aug>']
43
+ self.mask_tokens = ['<mask-%d>' % i for i in range(self.num_mask_tokens)]
44
+ return [self.predict_start_token] + self.mask_tokens
45
+ elif self.mask_type == 'mask_aug':
46
+ return [self.mask_aug_token]
47
+ else:
48
+ raise ValueError(f'Invalid mask type {self.mask_type}')
49
+
50
+ def dump2json(self, json_example):
51
+ if self.simplify_json: # serialize the dict to a str; with simplify_json, drop extra spaces/newlines and strip the double quotes around special tokens
52
+ content = json.dumps(json_example, separators=(',',':'))
53
+ for token in self.additional_special_tokens:
54
+ content = content.replace(f'"{token}"', token)
55
+ else:
56
+ content = json.dumps(json_example)
57
+ return content
58
+
59
+ def load_json(self, content): # parse the str back into JSON
60
+ replace_tokens = set(self.additional_special_tokens) - set(self.io_ignore_replace_tokens) # sirui change
61
+ if self.simplify_json:
62
+ for token in replace_tokens: # with simplify_json, re-wrap each special token in double quotes
63
+ content = content.replace(token, f'"{token}"')
64
+ return json.loads(content)
65
+
66
+ def apply_masking(self,
67
+ json_example,
68
+ mask_all=None,
69
+ return_meta=False,
70
+ target_keys=['width', 'height', 'left', 'top'],
71
+ target_element_types=None
72
+ ):
73
+ if mask_all is None:
74
+ mask_all = self.mask_all
75
+ json_example = copy.deepcopy(json_example)
76
+ target_keys = set(target_keys)
77
+ target_tokens = []
78
+ for shape_i, shape in enumerate(json_example['layers']['textlayer']):
79
+ # element_type = self.general_dequantize(shape['type'],'type',to_float=False)
80
+ # if target_element_types is not None:
81
+ # if element_type not in target_element_types:
82
+ # continue
83
+ for key_i, key in enumerate(shape.keys()):
84
+ if key in target_keys:
85
+ target_tokens.append((shape_i, key_i, key, shape[key]))
86
+ if not mask_all:
87
+ target_num_mask_tokens = random.randint(1, self.num_mask_tokens)
88
+ if len(target_tokens) > target_num_mask_tokens:
89
+ random.shuffle(target_tokens)
90
+ target_tokens = target_tokens[:target_num_mask_tokens]
91
+ # sort by shape_i and key_i
92
+ target_tokens = sorted(target_tokens, key=lambda x: x[0]*100+x[1])
93
+ else:
94
+ if len(target_tokens) > self.num_mask_tokens:
95
+ # keep only the last num_mask_tokens entries
96
+ target_tokens = target_tokens[-self.num_mask_tokens:]
97
+
98
+ tuples = []
99
+ meta_infos = []
100
+ for mask_i, (shape_i, key_i, key, value) in enumerate(target_tokens):
101
+ if self.mask_type == 'cm3':
102
+ mask_token = self.mask_tokens[mask_i]
103
+ elif self.mask_type == 'mask_aug':
104
+ mask_token = self.mask_aug_token
105
+ else:
106
+ raise ValueError(f'Invalid mask type {self.mask_type}')
107
+ # <one-1><decimal0-1><decimal1-2>
108
+ if '<' in value:
109
+ num_token = value.count('<')
110
+ else:
111
+ num_token = value.count(' ')
112
+ json_example['layers']['textlayer'][shape_i][key] = mask_token
113
+ tuples.append((mask_token, value, num_token))
114
+ meta_infos.append((shape_i,key))
115
+ if return_meta:
116
+ return json_example, tuples, meta_infos
117
+ else:
118
+ return json_example, tuples
119
+
120
+ def make_prediction_postfix(self, tuples):
121
+ postfix = self.predict_start_token
122
+ for mask_token, value, num_token in tuples:
123
+ postfix = postfix+ f'{mask_token}{value}'
124
+ return postfix
125
+
126
+ # specs={
127
+ # "width":"size",
128
+ # "height":"size",
129
+ # "left":"pos",
130
+ # "top":"pos",
131
+ # "x":"pos", # center x
132
+ # "y":"pos", # center y
133
+ # "opacity":"opacity",
134
+ # "color":"color",
135
+ # "angle":"angle",
136
+ # "font_size":"font_size",
137
+ # 'ratio':'ratio',
138
+ # 'letter_spacing': 'spacing',
139
+ # 'textlen': 'textlen'
140
+ # }
141
+
142
+ specs={
143
+ "width":"size",
144
+ "height":"size",
145
+ "x":"pos", # center x
146
+ "y":"pos", # center y
147
+ "color":"color",
148
+ "font":"font"
149
+ }
150
+
151
+ # TODO change min_max_bins
152
+ # min_max_bins = {
153
+ # 'size':(0,2,256),
154
+ # 'pos':(-1,1,256),
155
+ # # 'opacity':(0,1,8),
156
+ # 'opacity':(0,255,8),
157
+ # 'color':(0,255,32),
158
+ # 'angle':(0,2*np.pi,64),
159
+ # 'font_size':(2,200,100),
160
+ # 'spacing': (0,1,40),
161
+ # 'textlen': (1,20,20)
162
+ # }
163
+ min_max_bins = {
164
+ 'size': (0,1,256),
165
+ 'pos': (0,1,256),
166
+ 'color': (0,137,138),
167
+ 'font': (0,511,512)
168
+ }
169
+
170
+ import numpy as np
171
+
172
+ # pre and post refer to powers of ten for the integer and fractional parts; the arguments give the number of digits in each
173
+ def get_keys_and_multipliers(pre_decimal=3, post_decimal=2):
174
+ pre_keys = ['one', 'ten', 'hundred', 'thousand']
175
+ pre_multiplers = [1, 10, 100, 1000]
176
+ assert pre_decimal <= len(pre_keys)
177
+ pre_keys = pre_keys[:pre_decimal][::-1]
178
+ pre_multiplers = pre_multiplers[:pre_decimal][::-1]
179
+
180
+ post_keys = [f'decimal{x}' for x in range(post_decimal)]
181
+ post_multiplers = [10 ** -(x+1) for x in range(post_decimal)]
182
+
183
+ keys = pre_keys + post_keys
184
+ multiplers = pre_multiplers + post_multiplers
185
+ return keys, multiplers
186
+
187
+ class DecimalQuantizer:
188
+ def __init__(self, max_pre_decimal=3, max_post_decimal=2):
189
+ self.max_pre_decimal = max_pre_decimal
190
+ self.max_post_decimal = max_post_decimal
191
+ self.keys, self.multiplers = get_keys_and_multipliers(max_pre_decimal, max_post_decimal)
192
+ self.symbols = {
193
+ -1: '<symbol-1>',
194
+ 1: '<symbol-0>',
195
+ }
196
+
197
+ def get_vocab(self):
198
+ special_tokens = [*self.symbols.values()] # ['<symbol-1>', '<symbol-0>']
199
+ for key in self.keys: # ['one', 'ten', 'hundred', 'thousand'] + ['decimal0', 'decimal1]
200
+ special_tokens.extend([f'<{key}-{i}>' for i in range(10)])
201
+ return special_tokens
202
+
203
+ def check_valid(self, token):
204
+ prefix = token.lstrip('<').split('-')[0] # '<symbol-1>' -> 'symbol-1>' -> ['symbol', '1>']
205
+ if prefix =='symbol' or prefix in self.keys:
206
+ return True
207
+ else:
208
+ return False
209
+
210
+ # keep two digits after the decimal point
211
+ def __call__(self, val, pre_decimal=None, post_decimal=None, need_symbol=False): # 100.00
212
+ if pre_decimal is None:
213
+ pre_decimal = self.max_pre_decimal
214
+ if post_decimal is None:
215
+ post_decimal = self.max_post_decimal
216
+
217
+ assert pre_decimal <= self.max_pre_decimal
218
+ assert post_decimal <= self.max_post_decimal
219
+
220
+ keys, multiplers = get_keys_and_multipliers(pre_decimal, post_decimal)
221
+
222
+ symbol = int(np.sign(val)) # np.sign returns a float (1.0, -1.0 or 0.0) for positive, negative and zero
223
+ if symbol == 0: # collapse to two classes: >= 0 and < 0
224
+ symbol = 1
225
+ val = round(abs(val), post_decimal) # round the absolute value of val to post_decimal decimal places
226
+
227
+ tokens = []
228
+ if need_symbol: # self.symbols = {-1: '<symbol-1>', 1: '<symbol-0>',}
229
+ symbol_type = self.symbols[symbol]
230
+ tokens.append(symbol_type)
231
+ else:
232
+ assert symbol >= 0
233
+
234
+ for key, multipler in zip(keys, multiplers):
235
+ # extract each digit of the given value val and emit a token such as '<one-7>'
236
+ v = math.floor(val / multipler)
237
+ if v > 9:
238
+ raise ValueError(f'Invalid value {val} for {pre_decimal} pre_decimal and {post_decimal} post_decimal')
239
+ val = val - v * multipler
240
+ tokens.append(f'<{key}-{v}>')
241
+
242
+ # emit one token per digit of val; if need_symbol is True, a leading symbol-0 / symbol-1 marks >= 0 vs < 0
243
+ return ''.join(tokens)
244
+
245
+ def parse_token(self, token):
246
+ # <hundred-1> -> hundred, 1
247
+ key, val = token[1:-1].split('-')
248
+ return key, int(val)
249
+
250
+ def decode(self, tokens_str): # split tokens_str on '>', re-append the '>', and turn it into a list of tokens
251
+ tokens = tokens_str.split('>')
252
+ tokens = [x+'>' for x in tokens if x != '']
253
+ if tokens[0].startswith('<symbol'):
254
+ symbol_type = tokens[0]
255
+ tokens = tokens[1:]
256
+ inv_map = {v: k for k, v in self.symbols.items()} # invert the original dict (swap keys and values)
257
+ symbol = inv_map[symbol_type]
258
+ else:
259
+ symbol = 1
260
+
261
+ accumulater = 0
262
+ for token in tokens:
263
+ key, val = self.parse_token(token)
264
+ multipler_index = self.keys.index(key)
265
+ multipler = self.multiplers[multipler_index]
266
+ actual_val = val * multipler
267
+ # print(key, val, multipler, actual_val)
268
+ accumulater += actual_val
269
+ accumulater = accumulater * symbol
270
+
271
+ # reconstruct the original signed value, with precision controlled by the pre/post_decimal digit counts
272
+ return accumulater
273
+
274
+ # min_max_bins = {
275
+ # 'size': (0,1,256),
276
+ # 'pos': (0,1,256),
277
+ # 'color': (0,137,138),
278
+ # 'font': (0,511,512)
279
+ # }
280
+ pre_post_decimals={
281
+ 'size': {
282
+ 'pre_decimal': 1,
283
+ 'post_decimal': 2,
284
+ 'need_symbol': False
285
+ },
286
+ 'pos': {
287
+ 'pre_decimal': 1,
288
+ 'post_decimal': 2,
289
+ 'need_symbol': True
290
+ },
291
+ 'opacity': {
292
+ 'pre_decimal': 1,
293
+ 'post_decimal': 1,
294
+ 'need_symbol': False
295
+ },
296
+ 'color':{
297
+ 'pre_decimal': 3,
298
+ 'post_decimal': 0,
299
+ 'need_symbol': False
300
+ },
301
+ 'angle':{
302
+ 'pre_decimal': 1,
303
+ 'post_decimal': 2,
304
+ 'need_symbol': False
305
+ },
306
+ 'font_size':{
307
+ 'pre_decimal': 3,
308
+ 'post_decimal': 0,
309
+ 'need_symbol': False
310
+ },
311
+ }
312
+
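A quick round-trip sketch of the `DecimalQuantizer` defined above (illustrative only; the inputs are chosen so the digit extraction is not affected by floating-point rounding, and the token strings assume the digit settings shown):

# Round-trip sketch for DecimalQuantizer with the default digit settings.
q = DecimalQuantizer(max_pre_decimal=3, max_post_decimal=2)

tokens = q(137.5)
print(tokens)            # '<hundred-1><ten-3><one-7><decimal0-5><decimal1-0>'
print(q.decode(tokens))  # 137.5

# With need_symbol=True a leading <symbol-*> token records the sign.
signed = q(-0.5, pre_decimal=1, post_decimal=2, need_symbol=True)
print(signed)            # '<symbol-1><one-0><decimal0-5><decimal1-0>'
print(q.decode(signed))  # -0.5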
313
+ class QuantizerV4(BaseQuantizer):
314
+ def __init__(self, quant=True,
315
+ decimal_quantize_types = [],
316
+ decimal_quantize_kwargs = {'max_pre_decimal':3, 'max_post_decimal':2},
317
+ mask_values=False,
318
+ **kwargs):
319
+ super().__init__(**kwargs)
320
+ self.min = min
321
+ self.max = max
322
+ self.quant = quant
323
+ self.mask_values = mask_values
324
+ self.text_split_token = '<split-text>'
325
+ self.decimal_quantize_types = decimal_quantize_types
326
+ self.decimal_quantize = len(decimal_quantize_types) > 0
327
+ if len(decimal_quantize_types) > 0:
328
+ print('decimal quantize types', decimal_quantize_types)
329
+ self.decimal_quantizer = DecimalQuantizer(**decimal_quantize_kwargs)
330
+ else:
331
+ self.decimal_quantizer = None
332
+
333
+ self.set_min_max_bins(min_max_bins)
334
+ # min_max_bins = {
335
+ # 'size': (0,1,256),
336
+ # 'pos': (0,1,256),
337
+ # 'color': (0,137,138),
338
+ # 'font': (0,511,512)
339
+ # }
340
+ self.width = kwargs.get('width', 1456)
341
+ self.height = kwargs.get('height', 1457)
342
+ self.width = int(self.width)
343
+ self.height = int(self.height)
344
+
345
+ def set_min_max_bins(self, min_max_bins): # check that n_bins is even, then add 1 to it
346
+ min_max_bins = copy.deepcopy(min_max_bins)
347
+ # adjust the bins to plus one
348
+ for type_name, (min_val, max_val, n_bins) in min_max_bins.items():
349
+ assert n_bins % 2 == 0 # must be even
350
+ min_max_bins[type_name] = (min_val, max_val, n_bins+1)
351
+ self.min_max_bins = min_max_bins
352
+
353
+ def setup_tokenizer(self, tokenizer):
354
+ # this function builds additional_special_tokens: 1. '<split-text>' 2. decimal-quantizer tokens such as <one-1>, <symbol-1> 3. QuantizerV4 bin tokens such as <size-255> 4. self.get_additional_mask_tokens()
355
+ # and then registers them via tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})
356
+ additional_special_tokens = [self.text_split_token] # self.text_split_token = '<split-text>'
357
+ if self.decimal_quantize:
358
+ special_tokens = self.decimal_quantizer.get_vocab() # <one-1> <symbol-1>
359
+ self.io_ignore_replace_tokens += special_tokens # io_ignore_replace_tokens is initialized to ['<split-text>'] in BaseQuantizer
360
+ additional_special_tokens += special_tokens
361
+ # the order must be preserved, other wise the tokenizer will be wrong
362
+ rest_types = [key for key in self.min_max_bins.keys() if key not in self.decimal_quantize_types]
363
+ for type_name in rest_types:
364
+ min_val, max_val, n_bins = self.min_max_bins[type_name]
365
+ additional_special_tokens += [f'<{type_name}-{i}>' for i in range(n_bins)] # <size-256>
366
+
367
+ if self.num_mask_tokens > 0:
368
+ additional_special_tokens.extend(self.get_additional_mask_tokens())
369
+
370
+ print('additional_special_tokens', additional_special_tokens)
371
+
372
+ tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})
373
+ self.additional_special_tokens = set(additional_special_tokens)
374
+ return tokenizer
375
+
376
+ @lru_cache(maxsize=128) # cache return values for speed; maxsize=128 keeps results for up to 128 distinct inputs
377
+ def get_bins(self, real_type): # real_type: size, pos, font, color
378
+ # return the minimum, the maximum, and an evenly spaced array of bin edges
379
+ min_val, max_val, n_bins = self.min_max_bins[real_type]
380
+ return min_val, max_val, np.linspace(min_val, max_val, n_bins)
381
+
382
+ def quantize(self, x, type): # (0.25, 'y') -> (<size-50>)
383
+ if not self.quant:
384
+ return x
385
+ """Quantize a float array x into n_bins discrete values."""
386
+ real_type = specs[type] # x, y, width, height, color, font -> size, pos, font, color
387
+ min_val, max_val, bins = self.get_bins(real_type)
388
+ x = np.clip(float(x), min_val, max_val) # clamp x into [min_val, max_val]
389
+ if self.decimal_quantize and real_type in self.decimal_quantize_types:
390
+ return self.decimal_quantizer(x, **pre_post_decimals[real_type])
391
+ val = np.digitize(x, bins) - 1 # val is an integer index into the bins array
392
+ n_bins = len(bins)
393
+ assert val >= 0 and val < n_bins
394
+ return f'<{real_type}-{val}>' # <size-255>
395
+
396
+ def dequantize(self, x): # (<size-255> -> 0.99?)
397
+ # <pos-1>->1
398
+ val = x.split('-')[1].strip('>')
399
+ # <pos-1>->pos
400
+ real_type = x.split('-')[0][1:]
401
+ if self.decimal_quantize and self.decimal_quantizer.check_valid(x):
402
+ return self.decimal_quantizer.decode(x)
403
+ min_val, max_val, bins = self.get_bins(real_type)
404
+ return bins[int(val)]
405
+
406
+ def construct_map_dict(self):
407
+ map_dict = {}
408
+ for i in range(self.min_max_bins['size'][2]): # 'size': (0, 1, 256),
409
+ name = "<size-%d>" % i
410
+ value = self.dequantize(name)
411
+ map_dict[name] = str(value) # 255 -> 0.99?
412
+ for i in range(self.min_max_bins['pos'][2]):
413
+ name = "<pos-%d>" % i
414
+ value = self.dequantize(name)
415
+ map_dict[name] = str(value)
416
+ return map_dict
417
+
418
+ def postprocess_colorandfont(self, json_example):
419
+ # wrap the regex-matched font/color tokens in double quotes
420
+ import re
421
+ json_example = re.sub(r'(<font-\d+>)', r'"\1"', json_example)
422
+ json_example = re.sub(r'(<color-\d+>)', r'"\1"', json_example)
423
+ return json_example
424
+
425
+ def to_str(self, x, type):
426
+ feature = self.get_feature(type)
427
+ return feature.int2str(x)
428
+
429
+ def convert2layout(self, example): # convert raw layout values into quantized tokens such as <size-255>
430
+ new_example = OrderedDict()
431
+ new_example['wholecaption'] = example['wholecaption']
432
+ new_layout = []
433
+ for meta_layer in example['layout']:
434
+ new_layout.append({
435
+ "layer": meta_layer["layer"],
436
+ "x": self.quantize(meta_layer["x"]/self.width, 'x'),
437
+ "y": self.quantize(meta_layer["y"]/self.height, 'y'),
438
+ "width": self.quantize(meta_layer["width"]/self.width, 'width'),
439
+ "height": self.quantize(meta_layer["height"]/self.height, 'height')
440
+ })
441
+ new_example['layout'] = new_layout
442
+ return new_example
443
+
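A hedged sketch of the expected input/output shape of convert2layout; `quantizer` denotes an already constructed instance, and the concrete numbers assume a 512x512 canvas with 256 bins per axis, neither of which is read from this file:

example = {
    "wholecaption": "a summer sale poster",
    "layout": [
        {"layer": "text_0", "x": 128, "y": 64, "width": 256, "height": 96},
    ],
}
# with quantizer.width == quantizer.height == 512 and 256 bins per axis:
quantized = quantizer.convert2layout(example)
# quantized['layout'][0] ≈ {'layer': 'text_0', 'x': '<pos-63>', 'y': '<pos-31>',
#                           'width': '<size-127>', 'height': '<size-47>'}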
+     def apply_masking(self,
+                       json_example,
+                       mask_all=None,
+                       return_meta=False,
+                       # target_keys=['width', 'height', 'left', 'top'],  # unused
+                       # target_element_types=None,  # unused
+                       mask_values=True
+                       ):
+         if mask_all is None:
+             mask_all = self.mask_all
+
+         json_example = copy.deepcopy(json_example)
+
+         # this block replaces selected values in the json with <mask-i> tokens, capping the number of masks
+         # at self.num_mask_tokens and, depending on the arguments, choosing them randomly;
+         # it records (mask_token, value, num_token) triples, where num_token = value.count('<')
+         target_tokens = []
+         if self.mask_values and mask_values:
+             target_tokens.append((-1, -1, 'globalcaption', json_example['globalcaption']))
+             target_tokens.append((-1, -1, 'canvas_width', json_example['canvas_width']))
+             target_tokens.append((-1, -1, 'canvas_height', json_example['canvas_height']))
+             target_tokens.append((-1, -1, 'category', json_example['category']))
+             target_tokens.append((-1, -1, 'keywords', json_example['keywords']))
+             target_tokens.append((-1, -1, 'bgcaption', json_example['layers']['bglayer']['bgcaption']))
+             target_tokens.append((-1, -1, 'flag', json_example['layers']['objlayer']['flag']))
+             target_tokens.append((-1, -1, 'objcaption', json_example['layers']['objlayer']['objcaption']))
+         for layer_i, textlayer in enumerate(json_example['layers']['textlayer']):
+             target_tokens.append((layer_i, -1, 'text', json_example['layers']['textlayer'][textlayer]))
+         if not mask_all:  # draw target_num_mask_tokens at random, upper-bounded by self.num_mask_tokens
+             target_num_mask_tokens = random.randint(1, self.num_mask_tokens)
+             if len(target_tokens) > target_num_mask_tokens:
+                 random.shuffle(target_tokens)
+                 target_tokens = target_tokens[:target_num_mask_tokens]
+                 # sort by shape_i and key_i
+                 target_tokens = sorted(target_tokens, key=lambda x: x[0]*100+x[1])
+         else:  # use a fixed number, num_mask_tokens
+             if len(target_tokens) > self.num_mask_tokens:
+                 # keep the last few entries
+                 target_tokens = target_tokens[-self.num_mask_tokens:]
+
+         tuples = []
+         meta_infos = []
+         layer_list = ['heading', 'subheading', 'body']
+         for mask_i, (shape_i, key_i, key, value) in enumerate(target_tokens):
+             if self.mask_type == 'cm3':
+                 mask_token = self.mask_tokens[mask_i]
+             elif self.mask_type == 'mask_aug':
+                 mask_token = self.mask_aug_token
+             else:
+                 raise ValueError(f'Invalid mask type {self.mask_type}')
+             # a value may itself be a token sequence such as <one-1><decimal0-1><decimal1-2>
+             if '<' in value:
+                 num_token = value.count('<')
+             else:
+                 num_token = value.count(' ') + 1
+             if shape_i == -1:
+                 if key in ['bgcaption']:
+                     json_example['layers']['bglayer']['bgcaption'] = mask_token
+                 elif key in ['objcaption']:
+                     json_example['layers']['objlayer']['objcaption'] = mask_token
+                 elif key in ['flag']:
+                     json_example['layers']['objlayer']['flag'] = mask_token
+                 else:
+                     json_example[key] = mask_token
+             else:
+                 curlayer = layer_list[shape_i]
+                 json_example['layers']['textlayer'][curlayer] = mask_token
+             tuples.append((mask_token, value, num_token))
+             meta_infos.append((shape_i, key))
+         if return_meta:
+             return json_example, tuples, meta_infos
+         else:
+             return json_example, tuples
+
+
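To make the masking behaviour concrete, a hedged sketch of one call; the field values and the instance configuration (self.mask_values True, mask_type == 'cm3', num_mask_tokens >= 11) are illustrative assumptions, not values taken from this repository:

example = {
    "globalcaption": "summer sale banner",
    "canvas_width": "<size-255>",
    "canvas_height": "<size-255>",
    "category": "promotion",
    "keywords": "sale summer beach",
    "layers": {
        "bglayer": {"bgcaption": "a sunny beach background"},
        "objlayer": {"flag": "true", "objcaption": "a beach ball"},
        "textlayer": {"heading": "Big Summer Sale", "subheading": "Up to 50% off", "body": "Only this week"},
    },
}
masked, tuples = quantizer.apply_masking(example, mask_all=True)
# with enough mask tokens every listed value is replaced in order, e.g.
#   masked['globalcaption'] == '<mask-0>'
#   masked['layers']['textlayer']['heading'] == '<mask-8>'
# and each replacement is recorded as (mask_token, original_value, num_token), e.g.
#   ('<mask-8>', 'Big Summer Sale', 3)   # 3 = number of whitespace-separated words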
+ # unused: originally used for rendering
+ def is_font_exists(font_name):
+     font_list = font_manager.findSystemFonts()
+     # print("\nfont_list: ", font_list)
+     for font in font_list:
+         if font_name.lower() in font.lower():
+             return True
+     return False
+
+ def print_info(msg):
+     print(Fore.GREEN + "[INFO] " + msg)
+
+ def print_warning(msg):
+     print(Fore.YELLOW + "[WARNING] " + msg)
+
+ def print_error(msg):
+     print(Fore.RED + "[ERROR] " + msg)
+
+ def load_feature(path):
+     with open(path) as f:
+         content = f.read()
+     content = json.loads(content)
+     names = [content[str(i)] for i in range(len(content))]
+     return ClassLabel(num_classes=len(names), names=names)
+
+ def get_quantizer(version='v1', update_vocab=False, **kwargs):
+     # if kwargs.pop('separate_alpha', False):  # unused
+     #     kwargs['n_visual_tokens'] *= 2
+     if version == 'v4':
+         quantizer = QuantizerV4(**kwargs)
+     else:
+         raise NotImplementedError
+
+     return quantizer
+
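Putting the pieces together, a hedged sketch of how a caller might obtain a quantizer and prepare a tokenizer. The keyword arguments and the setup-method name are assumptions, since only the method body (not its signature) appears in this diff; note also that the default version='v1' raises NotImplementedError, so callers must pass version='v4'.

from transformers import AutoTokenizer

quantizer = get_quantizer(
    version='v4',        # the only implemented branch
    quant=True,          # assumed QuantizerV4 keyword arguments
    num_mask_tokens=0,
)
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder checkpoint
tokenizer = quantizer.setup_tokenizer(tokenizer)   # hypothetical name for the special-token method shown above
map_dict = quantizer.construct_map_dict()          # later used to turn generated tokens back into numbers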
requirements.txt ADDED
@@ -0,0 +1,46 @@
+ # Core frameworks
+ torch==2.4.0              # keep in line with the conda environment (installed from PyPI)
+ torchvision==0.19.0       # keep in line with the conda environment (installed from PyPI)
+
+ # Hugging Face ecosystem
+ transformers==4.44.0      # was 4.39.1 -> conda actually installs 4.44.0
+ diffusers==0.31.0         # keep in line with the conda environment
+ accelerate==0.34.2        # was 0.27.2 -> conda actually installs 0.34.2
+ peft==0.12.0              # was a git commit -> conda actually installs 0.12.0 (PyPI)
+ datasets==2.20.0          # keep in line with the conda environment
+
+ # Toolchain
+ deepspeed==0.15.4         # was 0.14.2 -> conda actually installs 0.15.4
+ # bitsandbytes==0.44.1    # was 0.43.0 -> conda actually installs 0.44.1
+ protobuf==3.20.0          # was 3.20.3 -> conda actually installs 3.20.0 (tensorboard compatibility needs verification)
+ tensorboard==2.18.0       # newly pinned explicit version (conda actually installs 2.18.0)
+ tensorboardx==2.6.2.2     # newly pinned explicit version (conda actually installs 2.6.2.2)
+ webdataset==0.2.100       # newly pinned explicit version (conda actually installs 0.2.100)
+
+ # Training utilities
+ warmup_scheduler==0.3     # newly pinned explicit version (conda actually installs 0.3)
+ torchmetrics==1.6.0       # newly pinned explicit version (conda actually installs 1.6.0)
+ open_clip_torch==2.29.0   # newly pinned explicit version (conda actually installs 2.29.0)
+ evaluate==0.4.3           # newly pinned explicit version (conda actually installs 0.4.3)
+ bert_score==0.3.13        # newly pinned explicit version (conda actually installs 0.3.13)
+ einops==0.8.0             # keep in line with the conda environment
+ wandb==0.17.7             # keep in line with the conda environment
+
+ # Image processing
+ matplotlib==3.9.2         # newly pinned explicit version (conda actually installs 3.9.2)
+ opencv-python==4.10.0.84  # newly pinned explicit version (conda actually installs 4.10.0.84)
+ clean-fid==0.1.35         # newly pinned explicit version (conda actually installs 0.1.35)
+ skia-python==87.6         # newly pinned explicit version (conda actually installs 87.6)
+
+ # Deployment and interfaces
+ # gradio==5.5.0           # newly pinned explicit version (conda actually installs 5.5.0)
+ langchain>=0.0.139        # keep the constraint (conda actually installs 0.3.7, which satisfies it)
+ tiktoken==0.8.0           # newly pinned explicit version (conda actually installs 0.8.0)
+
+ # System tools
+ ninja==1.11.1.1           # newly pinned explicit version (conda actually installs 1.11.1.1)
+ pynvml==11.5.3            # newly pinned explicit version (conda actually installs 11.5.3)
+ colorama==0.4.6           # newly pinned explicit version (conda actually installs 0.4.6)
+ click>=8.0.4,<9           # keep the constraint (conda actually installs 8.1.7, which satisfies it)
+
+ sentencepiece