YucYux committed
Commit 60e176e · 1 Parent(s): d08f144

Added support for MMaDA-8B-MixCoT

Files changed (1): app.py (+290, -71)
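For context, the diff below switches the Space's default checkpoint from Gen-Verse/MMaDA-8B-Base to Gen-Verse/MMaDA-8B-MixCoT, adds MixCoT to MODEL_CHOICES and MODEL_ACTUAL_PATHS, introduces per-model example blocks, and turns thinking mode on by default for MixCoT. As a minimal sketch (not part of this commit), the new default can be loaded standalone with the same calls app.py uses; the `from models import MMadaModelLM` import path is an assumption about the surrounding repo layout, everything else mirrors the diff:

import torch
from transformers import AutoTokenizer
from models import MMadaModelLM  # assumed import path; adjust to your checkout

device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "Gen-Verse/MMaDA-8B-MixCoT"  # new default from this commit

# Same loading calls as app.py: bfloat16 weights, remote code enabled.
model = MMadaModelLM.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)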
app.py CHANGED
@@ -47,22 +47,23 @@ def get_num_transfer_tokens(mask_index, steps):
     return num_transfer_tokens
 
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-DEFAULT_MODEL_PATH = "Gen-Verse/MMaDA-8B-Base" # Default
+DEFAULT_MODEL_PATH = "Gen-Verse/MMaDA-8B-MixCoT" # Default
 MASK_ID = 126336
 MODEL = MMadaModelLM.from_pretrained(DEFAULT_MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16).to(DEVICE).eval()
 TOKENIZER = AutoTokenizer.from_pretrained(DEFAULT_MODEL_PATH, trust_remote_code=True)
 uni_prompting = UniversalPrompting(TOKENIZER, max_text_len=512, special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"), ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=True)
 VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2").to(DEVICE)
 
-CURRENT_MODEL_PATH = None
+CURRENT_MODEL_PATH = DEFAULT_MODEL_PATH
 
 MODEL_CHOICES = [
     "MMaDA-8B-Base",
-    "MMaDA-8B-MixCoT (coming soon)",
+    "MMaDA-8B-MixCoT",
     "MMaDA-8B-Max (coming soon)"
 ]
 MODEL_ACTUAL_PATHS = {
-    "MMaDA-8B-Base": DEFAULT_MODEL_PATH,
+    "MMaDA-8B-Base": "Gen-Verse/MMaDA-8B-Base",
+    "MMaDA-8B-MixCoT": "Gen-Verse/MMaDA-8B-MixCoT"
 }
 
 def clear_outputs_action():
@@ -116,19 +117,91 @@ def _load_model_and_tokenizer_core(model_path_to_load, model_display_name_for_status):
     # return f"Error loading model '{model_display_name_for_status}': {str(e)}"
 
 def handle_model_selection_change(selected_model_name_ui):
-    if "coming soon" in selected_model_name_ui.lower():
-        global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH
+    global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH, DEVICE, uni_prompting
+
+    status_msg = ""
+    # Initialize the visibility updates for the Examples blocks
+    vis_lm_base = gr.update(visible=False)
+    vis_lm_mixcot = gr.update(visible=False)
+    vis_lm_max = gr.update(visible=False)
+    vis_mmu_base = gr.update(visible=False)
+    vis_mmu_mixcot = gr.update(visible=False)
+    vis_mmu_max = gr.update(visible=False)
+
+    # Decide the default thinking-mode state from the selected model
+    is_mixcot_model_selected = (selected_model_name_ui == "MMaDA-8B-MixCoT")
+
+    # Initial thinking-mode state and button labels:
+    # defaults to True (enabled) for the MixCoT model
+    current_thinking_mode_lm_state = is_mixcot_model_selected
+    current_thinking_mode_mmu_state = is_mixcot_model_selected
+
+    lm_think_button_label = "Thinking Mode ✅" if current_thinking_mode_lm_state else "Thinking Mode ❌"
+    mmu_think_button_label = "Thinking Mode ✅" if current_thinking_mode_mmu_state else "Thinking Mode ❌"
+
+    update_think_button_lm = gr.update(value=lm_think_button_label)
+    update_think_button_mmu = gr.update(value=mmu_think_button_label)
+
+    if selected_model_name_ui == "MMaDA-8B-Max (coming soon)":
         MODEL = None
         TOKENIZER = None
         MASK_ID = None
         CURRENT_MODEL_PATH = None
-        return f"'{selected_model_name_ui}' is not yet available. Please select 'Model A'."
-
-    actual_path = MODEL_ACTUAL_PATHS.get(selected_model_name_ui)
-    if not actual_path:
-        return f"Path for '{selected_model_name_ui}' is not defined. Cannot load."
-
-    return _load_model_and_tokenizer_core(actual_path, selected_model_name_ui)
+        status_msg = f"'{selected_model_name_ui}' is not yet available. Please select another model."
+        vis_lm_max = gr.update(visible=True)
+        vis_mmu_max = gr.update(visible=True)
+        # For non-MixCoT models, thinking mode was already set to False above via is_mixcot_model_selected
+    else:
+        actual_path = MODEL_ACTUAL_PATHS.get(selected_model_name_ui)
+        if not actual_path:
+            MODEL = None
+            TOKENIZER = None
+            MASK_ID = None
+            CURRENT_MODEL_PATH = None
+            status_msg = f"Path for '{selected_model_name_ui}' is not defined. Cannot load."
+            # If the path is undefined (meaning no valid MixCoT load), thinking mode should be False
+            if is_mixcot_model_selected:  # it should have been MixCoT, but the path is missing
+                current_thinking_mode_lm_state = False
+                current_thinking_mode_mmu_state = False
+                update_think_button_lm = gr.update(value="Thinking Mode ❌")
+                update_think_button_mmu = gr.update(value="Thinking Mode ❌")
+        else:
+            # Try to load the model
+            status_msg = _load_model_and_tokenizer_core(actual_path, selected_model_name_ui)
+
+            # Check whether the model loaded successfully
+            if "Error loading model" in status_msg or MODEL is None:
+                # If the MixCoT model failed to load, turn thinking mode back off
+                if is_mixcot_model_selected:
+                    current_thinking_mode_lm_state = False
+                    current_thinking_mode_mmu_state = False
+                    update_think_button_lm = gr.update(value="Thinking Mode ❌")
+                    update_think_button_mmu = gr.update(value="Thinking Mode ❌")
+                if MODEL is None and "Error" not in status_msg:  # add a generic error message
+                    status_msg = f"Failed to properly load model '{selected_model_name_ui}'. {status_msg}"
+            else:  # model loaded successfully
+                if selected_model_name_ui == "MMaDA-8B-Base":
+                    vis_lm_base = gr.update(visible=True)
+                    vis_mmu_base = gr.update(visible=True)
+                elif selected_model_name_ui == "MMaDA-8B-MixCoT":
+                    vis_lm_mixcot = gr.update(visible=True)
+                    vis_mmu_mixcot = gr.update(visible=True)
+                # Thinking mode was already set to True at the top via is_mixcot_model_selected
+
+    return (
+        status_msg,
+        vis_lm_base,
+        vis_lm_mixcot,
+        vis_lm_max,
+        vis_mmu_base,
+        vis_mmu_mixcot,
+        vis_mmu_max,
+        # New return values for updating the thinking-mode states and buttons
+        current_thinking_mode_lm_state,   # plain value for gr.State
+        update_think_button_lm,           # gr.update object for gr.Button
+        current_thinking_mode_mmu_state,
+        update_think_button_mmu
+    )
 
 
 def get_highlighted_text_tuples(current_x_ids_batch, prompt_input_ids, prompt_len, tk, current_mask_id, raw_prompt_attention_mask):
@@ -618,7 +691,7 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
         model_select_radio = gr.Radio(
             label="Select Text Generation Model",
             choices=MODEL_CHOICES,
-            value=MODEL_CHOICES[0]
+            value="MMaDA-8B-MixCoT"
         )
         model_load_status_box = gr.Textbox(
             label="Model Load Status",
@@ -662,17 +735,39 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
         output_final_text_box_lm = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
 
 
-
-        gr.Examples(
-            examples=[
-                ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
-                ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
-            ],
-            inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
-            outputs=[output_visualization_box_lm, output_final_text_box_lm],
-            fn=generate_viz_wrapper_lm,
-            cache_examples=False
-        )
+        with gr.Column(visible=False) as examples_lm_base:
+            gr.Examples(
+                examples=[
+                    ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
+                    ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
+                ],
+                inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
+                outputs=[output_visualization_box_lm, output_final_text_box_lm],
+                fn=generate_viz_wrapper_lm,
+                cache_examples=False
+            )
+        with gr.Column(visible=True) as examples_lm_mixcot:
+            gr.Examples(
+                examples=[
+                    ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
+                    ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
+                ],
+                inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
+                outputs=[output_visualization_box_lm, output_final_text_box_lm],
+                fn=generate_viz_wrapper_lm,
+                cache_examples=False
+            )
+        with gr.Column(visible=False) as examples_lm_max:
+            gr.Examples(
+                examples=[
+                    ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
+                    ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
+                ],
+                inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
+                outputs=[output_visualization_box_lm, output_final_text_box_lm],
+                fn=generate_viz_wrapper_lm,
+                cache_examples=False
+            )
 
         gr.Markdown("---")
         gr.Markdown("## Part 2. Multimodal Understanding")
@@ -681,7 +776,7 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
             prompt_input_box_mmu = gr.Textbox(
                 label="Enter your prompt:",
                 lines=3,
-                value="Please describe this image in detail."
+                value=""
             )
             think_button_mmu = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
             with gr.Accordion("Generation Parameters", open=True):
@@ -689,7 +784,7 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
                 gen_length_slider_mmu = gr.Slider(minimum=64, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
                 steps_slider_mmu = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
                 with gr.Row():
-                    block_length_slider_mmu = gr.Slider(minimum=32, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
+                    block_length_slider_mmu = gr.Slider(minimum=32, maximum=1024, value=64, step=32, label="Block Length", info="gen_length must be divisible by this.")
                     remasking_dropdown_mmu = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
                 with gr.Row():
                     cfg_scale_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
@@ -715,44 +810,120 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
         gr.Markdown("## Final Generated Text")
         output_final_text_box_mmu = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
 
-
-        gr.Examples(
-            examples=[
-                [
-                    "figs/sunflower.jpg",
-                    "Please describe this image in detail.",
-                    256,
-                    512,
-                    128,
-                    1,
-                    0,
-                    "low_confidence"
-                ],
-                [
-                    "figs/woman.jpg",
-                    "Please describe this image in detail.",
-                    256,
-                    512,
-                    128,
-                    1,
-                    0,
-                    "low_confidence"
-                ]
-            ],
-            inputs=[
-                image_upload_box,
-                prompt_input_box_mmu,
-                steps_slider_mmu,
-                gen_length_slider_mmu,
-                block_length_slider_mmu,
-                temperature_slider_mmu,
-                cfg_scale_slider_mmu,
-                remasking_dropdown_mmu
-            ],
-            outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
-            fn=generate_viz_wrapper,
-            cache_examples=False
-        )
+        with gr.Column(visible=False) as examples_mmu_base:
+            gr.Examples(
+                examples=[
+                    [
+                        "figs/sunflower.jpg",
+                        "Please describe this image in detail.",
+                        256,
+                        512,
+                        128,
+                        1,
+                        0,
+                        "low_confidence"
+                    ],
+                    [
+                        "figs/woman.jpg",
+                        "Please describe this image in detail.",
+                        256,
+                        512,
+                        128,
+                        1,
+                        0,
+                        "low_confidence"
+                    ]
+                ],
+                inputs=[
+                    image_upload_box,
+                    prompt_input_box_mmu,
+                    steps_slider_mmu,
+                    gen_length_slider_mmu,
+                    block_length_slider_mmu,
+                    temperature_slider_mmu,
+                    cfg_scale_slider_mmu,
+                    remasking_dropdown_mmu
+                ],
+                outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
+                fn=generate_viz_wrapper,
+                cache_examples=False
+            )
+        with gr.Column(visible=True) as examples_mmu_mixcot:
+            gr.Examples(
+                examples=[
+                    [
+                        "figs/geo.png",
+                        "In the given figure, a square ABCD is inscribed in a circle with center O. Point P is located on side CD. What is the value of angle APB?",
+                        256,
+                        512,
+                        64,
+                        1,
+                        0,
+                        "low_confidence"
+                    ],
+                    [
+                        "figs/bus.jpg",
+                        "What are the colors of the bus?",
+                        256,
+                        512,
+                        64,
+                        1,
+                        0,
+                        "low_confidence"
+                    ]
+                ],
+                inputs=[
+                    image_upload_box,
+                    prompt_input_box_mmu,
+                    steps_slider_mmu,
+                    gen_length_slider_mmu,
+                    block_length_slider_mmu,
+                    temperature_slider_mmu,
+                    cfg_scale_slider_mmu,
+                    remasking_dropdown_mmu
+                ],
+                outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
+                fn=generate_viz_wrapper,
+                cache_examples=False
+            )
+        with gr.Column(visible=False) as examples_mmu_max:
+            gr.Examples(
+                examples=[
+                    [
+                        "figs/sunflower.jpg",
+                        "Please describe this image in detail.",
+                        256,
+                        512,
+                        128,
+                        1,
+                        0,
+                        "low_confidence"
+                    ],
+                    [
+                        "figs/woman.jpg",
+                        "Please describe this image in detail.",
+                        256,
+                        512,
+                        128,
+                        1,
+                        0,
+                        "low_confidence"
+                    ]
+                ],
+                inputs=[
+                    image_upload_box,
+                    prompt_input_box_mmu,
+                    steps_slider_mmu,
+                    gen_length_slider_mmu,
+                    block_length_slider_mmu,
+                    temperature_slider_mmu,
+                    cfg_scale_slider_mmu,
+                    remasking_dropdown_mmu
+                ],
+                outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
+                fn=generate_viz_wrapper,
+                cache_examples=False
+            )
 
         gr.Markdown("---")
         gr.Markdown("## Part 3. Text-to-Image Generation")
@@ -823,21 +994,69 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
         inputs=[thinking_mode_mmu],
         outputs=[thinking_mode_mmu, think_button_mmu]
     )
-
 
 
-    def initialize_default_model():
-        default_model = "MMaDA-8B-Base"
-        result = handle_model_selection_change(default_model)
-        return default_model, result
+    def initialize_app_state():
+        default_model_choice = "MMaDA-8B-MixCoT"  # load MixCoT by default
+
+        # handle_model_selection_change now returns more items
+        status, lm_b_vis, lm_m_vis, lm_x_vis, \
+        mmu_b_vis, mmu_m_vis, mmu_x_vis, \
+        init_thinking_lm_state, init_think_lm_btn_update, \
+        init_thinking_mmu_state, init_think_mmu_btn_update = handle_model_selection_change(default_model_choice)
+
+        return (
+            default_model_choice,
+            status,
+            lm_b_vis,
+            lm_m_vis,
+            lm_x_vis,
+            mmu_b_vis,
+            mmu_m_vis,
+            mmu_x_vis,
+            init_thinking_lm_state,
+            init_think_lm_btn_update,
+            init_thinking_mmu_state,
+            init_think_mmu_btn_update
+        )
 
     demo.load(
-        fn=initialize_default_model,
+        fn=initialize_app_state,
         inputs=None,
-        outputs=[model_select_radio, model_load_status_box],
+        outputs=[
+            model_select_radio,
+            model_load_status_box,
+            examples_lm_base,
+            examples_lm_mixcot,
+            examples_lm_max,
+            examples_mmu_base,
+            examples_mmu_mixcot,
+            examples_mmu_max,
+            thinking_mode_lm,   # gr.State for LM thinking mode
+            think_button_lm,    # gr.Button for LM thinking mode
+            thinking_mode_mmu,  # gr.State for MMU thinking mode
+            think_button_mmu    # gr.Button for MMU thinking mode
+        ],
         queue=True
     )
 
+    model_select_radio.change(
+        fn=handle_model_selection_change,
+        inputs=[model_select_radio],
+        outputs=[
+            model_load_status_box,
+            examples_lm_base,
+            examples_lm_mixcot,
+            examples_lm_max,
+            examples_mmu_base,
+            examples_mmu_mixcot,
+            examples_mmu_max,
+            thinking_mode_lm,
+            think_button_lm,
+            thinking_mode_mmu,
+            think_button_mmu
+        ]
+    )
+
     def clear_outputs():
         return None, None, None # Clear image, visualization, and final text
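A note on the event wiring above: Gradio assigns a handler's returned tuple to its `outputs` list positionally, so `handle_model_selection_change` returns exactly the eleven values that `model_select_radio.change` lists, and `initialize_app_state` prepends `default_model_choice` because `demo.load` has `model_select_radio` as an extra first output. A self-contained sketch of that contract, with hypothetical component names (not from app.py):

import gradio as gr

with gr.Blocks() as sketch:
    box_a = gr.Textbox(label="A")
    box_b = gr.Textbox(label="B")

    def fill():
        # Returned values map to `outputs` by position:
        # the first goes to box_a, the second to box_b.
        return "value for A", "value for B"

    # Mirrors the demo.load wiring in the commit: the function's
    # return order must match the outputs list order.
    sketch.load(fn=fill, inputs=None, outputs=[box_a, box_b])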