KingNish committed (verified)
Commit dbd2fc4 · 1 Parent(s): 397bb2f

Update app.py

Files changed (1): app.py (+193 −228)

app.py CHANGED
@@ -178,13 +178,13 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
     result = {"text": "", "image": None}
     # Call inferencer with or without think parameter based on user choice
     for i in inferencer(text=prompt, think=show_thinking, understanding_output=False, **inference_hyper):
-        # print(type(i)) # For debugging stream
+        print(type(i))
         if type(i) == str:
             result["text"] += i
         else:
             result["image"] = i
 
-        yield result["image"], result.get("text", "")
+        yield result["image"], result.get("text", None)
 
 
 # Image Understanding function with thinking option and hyperparameters
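Note on the hunk above: `text_to_image` is a generator, so each `yield` streams a partial `(image, text)` pair into whatever pair of Gradio outputs the function is later bound to. A minimal, self-contained sketch of that streaming contract (component and function names here are illustrative, not from this app):

```python
# Sketch: a generator handler that streams (image, text) tuples into
# two Gradio outputs, mirroring the yield pattern in text_to_image.
import numpy as np
import gradio as gr

def fake_generate(prompt):
    text = ""
    for word in prompt.split():
        text += word + " "
        yield None, text                      # text streams in, image not ready yet
    image = np.zeros((64, 64, 3), dtype=np.uint8)
    yield image, text                         # final yield carries the image

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    img_out = gr.Image(label="Image")
    txt_out = gr.Textbox(label="Thinking")
    gr.Button("Go").click(fake_generate, inputs=prompt_box, outputs=[img_out, txt_out])

# demo.launch()  # generator handlers stream via Gradio's queue (on by default in recent versions)
```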
@@ -192,8 +192,7 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
 def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
                         do_sample=False, text_temperature=0.3, max_new_tokens=512):
     if image is None:
-        yield "Please upload an image for understanding."
-        return
+        return "Please upload an image."
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
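The `isinstance(image, np.ndarray)` guard in the context lines exists because Gradio can hand the handler a numpy array while the model path expects PIL. A standalone sketch of that normalization (the `convert("RGB")` step is an extra precaution, not something this app does):

```python
# Sketch: normalize a Gradio image input (numpy array or PIL.Image) to PIL.
import numpy as np
from PIL import Image

def to_pil(image):
    if image is None:
        return None
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    return image.convert("RGB")  # assumption: the model wants 3-channel input

arr = np.zeros((32, 32, 3), dtype=np.uint8)
print(type(to_pil(arr)))  # <class 'PIL.Image.Image'>
```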
@@ -204,24 +203,22 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
     inference_hyper = dict(
         do_sample=do_sample,
         temperature=text_temperature,
-        max_think_token_n=max_new_tokens, # Set max_length for text generation
+        max_think_token_n=max_new_tokens, # Set max_length
     )
 
-    result_text = ""
+    result = {"text": "", "image": None}
     # Use show_thinking parameter to control thinking process
     for i in inferencer(image=image, text=prompt, think=show_thinking,
                         understanding_output=True, **inference_hyper):
         if type(i) == str:
-            result_text += i
-            yield result_text
-        # else: This branch seems unused in original, as understanding_output=True typically yields text.
-        # If it yielded image, it would be an intermediate. For final output, it's text.
-        # For now, we assume it only yields text.
-    yield result_text # Ensure final text is yielded
+            result["text"] += i
+        else:
+            result["image"] = i
+        yield result["text"]
 
 
 # Image Editing function with thinking option and hyperparameters
-@spaces.GPU(duration=90)
+@spaces.GPU(duration=120)
 def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_scale=4.0,
                cfg_img_scale=2.0, cfg_interval=0.0,
                timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
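One caveat with the new `return "Please upload an image."` guard in the hunk above: `image_understanding` still contains a `yield`, so it remains a generator function, and a `return value` inside a generator is only attached to `StopIteration` rather than delivered as output, so a caller that merely iterates (as Gradio does) would see nothing. A quick illustration of that semantics:

```python
# Illustration: 'return value' inside a generator never reaches a caller
# that only iterates; it becomes the StopIteration payload instead.
def guard(image=None):
    if image is None:
        return "Please upload an image."  # swallowed by iteration
    yield "result"

print(list(guard()))  # [] -- the message is never yielded

gen = guard()
try:
    next(gen)
except StopIteration as stop:
    print(stop.value)  # 'Please upload an image.'
```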
@@ -231,8 +228,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
     set_seed(seed)
 
     if image is None:
-        yield None, "Please upload an image for editing." # Yield tuple for image/text
-        return
+        return "Please upload an image.", ""
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
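`set_seed(seed)` above is defined elsewhere in app.py. For context, a typical implementation seeds every RNG involved in sampling; this is a hedged sketch of the usual pattern, not the repo's actual helper:

```python
# Hypothetical set_seed: seeds Python, NumPy, and PyTorch RNGs so results
# are reproducible. The app's own helper may differ in details.
import random
import numpy as np
import torch

def set_seed(seed: int) -> None:
    if seed > 0:  # the UI documents 0 as "random seed"
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
```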
@@ -261,7 +257,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
         else:
             result["image"] = i
 
-        yield result["image"], result.get("text", "") # Yield tuple for image/text
+        yield result["image"], result.get("text", "")
 
 # Helper function to load example images
 def load_example_image(image_path):
@@ -271,232 +267,201 @@ def load_example_image(image_path):
         print(f"Error loading example image: {e}")
         return None
 
+
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("""
-    <div>
-    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
-    </div>
-    # BAGEL Multimodal Chatbot
-    Interact with BAGEL to generate images from text, edit existing images, or understand image content.
-    """)
-
-    # Chatbot display area
-    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(None, "https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/BAGEL_favicon.png"))
-
-    # Input area
-    with gr.Row():
-        image_input = gr.Image(type="pil", label="Optional: Upload an Image (for Image Understanding/Edit)", scale=0.5, value=None)
+    <div>
+    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
+    </div>
+    """)
+
+    with gr.Tab("📝 Text to Image"):
+        txt_input = gr.Textbox(
+            label="Prompt",
+            value="A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
+        )
 
-        with gr.Column(scale=1.5):
-            user_prompt = gr.Textbox(label="Your Message", placeholder="Type your prompt here...", lines=3)
-
-            with gr.Row():
-                mode_selector = gr.Radio(
-                    choices=["Text to Image", "Image Understanding", "Image Edit"],
-                    value="Text to Image",
-                    label="Select Mode",
-                    interactive=True
-                )
-                submit_btn = gr.Button("Send", variant="primary")
-
-    # Global/Shared Hyperparameters
-    with gr.Accordion("General Settings & Hyperparameters", open=False) as general_accordion:
         with gr.Row():
-            show_thinking_global = gr.Checkbox(label="Show Thinking Process", value=False, info="Enable to see model's intermediate thinking text.")
-            seed_global = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, label="Seed", info="0 for random seed, positive for reproducible results.")
+            show_thinking = gr.Checkbox(label="Thinking", value=False)
 
-        # Container for thinking-specific parameters, visibility controlled by show_thinking_global
-        thinking_params_container = gr.Group(visible=False)
-        with thinking_params_container:
-            gr.Markdown("#### Thinking Process Parameters (affect text generation)")
-            with gr.Row():
-                common_do_sample = gr.Checkbox(label="Enable Sampling", value=False, info="Enable sampling for text generation (otherwise greedy).")
-                common_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Text Temperature", info="Controls randomness in text generation (higher = more random).")
-                common_max_think_token_n = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Think Tokens / Max New Tokens", info="Maximum number of tokens for thinking (T2I/Edit) or generated text (Understanding).")
-
-    # T2I Hyperparameters
-    t2i_params_accordion = gr.Accordion("Text to Image Specific Parameters", open=False)
-    with t2i_params_accordion:
-        gr.Markdown("#### Text to Image Parameters")
-        with gr.Row():
-            t2i_image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"], value="1:1", label="Image Ratio", info="The longer size is fixed to 1024 pixels.")
-        with gr.Row():
-            t2i_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0 recommended).")
-            t2i_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="CFG Interval", info="Start of Classifier-Free Guidance application interval (end is fixed at 1.0).")
-        with gr.Row():
-            t2i_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="global", label="CFG Renorm Type", info="Normalization type for CFG. Use 'global' if the generated image is blurry.")
-            t2i_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization (1.0 disables CFG-Renorm).")
-        with gr.Row():
-            t2i_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image generation.")
-            t2i_timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details.")
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            # Two parameters per row
+            with gr.Group():
+                with gr.Row():
+                    seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1,
+                                     label="Seed", info="0 for random seed, positive for reproducible results")
+                    image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"],
+                                              value="1:1", label="Image Ratio",
+                                              info="The longer size is fixed to 1024")
+
+                with gr.Row():
+                    cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
+                                               label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0)")
+                    cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1,
+                                             label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
+
+                with gr.Row():
+                    cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
+                                                  value="global", label="CFG Renorm Type",
+                                                  info="If the generated image is blurry, use 'global'")
+                    cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                               label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
+
+                with gr.Row():
+                    num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
+                                              label="Timesteps", info="Total denoising steps")
+                    timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, interactive=True,
+                                               label="Timestep Shift", info="Higher values for layout, lower for details")
+
+            # Thinking parameters in a single row
+            thinking_params = gr.Group(visible=False)
+            with thinking_params:
+                with gr.Row():
+                    do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                    max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
+                                                  label="Max Think Tokens", info="Maximum number of tokens for thinking")
+                    text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
+                                                 label="Temperature", info="Controls randomness in text generation")
+
+        thinking_output = gr.Textbox(label="Thinking Process", visible=False)
+        img_output = gr.Image(label="Generated Image")
+        gen_btn = gr.Button("Generate")
+
+        # Dynamically show/hide thinking process box and parameters
+        def update_thinking_visibility(show):
+            return gr.update(visible=show), gr.update(visible=show)
+
+        show_thinking.change(
+            fn=update_thinking_visibility,
+            inputs=[show_thinking],
+            outputs=[thinking_output, thinking_params]
+        )
+
+        gen_btn.click(
+            fn=text_to_image,
+            inputs=[
+                txt_input, show_thinking, cfg_text_scale,
+                cfg_interval, timestep_shift,
+                num_timesteps, cfg_renorm_min, cfg_renorm_type,
+                max_think_token_n, do_sample, text_temperature, seed, image_ratio
+            ],
+            outputs=[img_output, thinking_output]
+        )
 
-    # Image Edit Hyperparameters
-    edit_params_accordion = gr.Accordion("Image Edit Specific Parameters", open=False)
-    with edit_params_accordion:
-        gr.Markdown("#### Image Edit Parameters")
-        with gr.Row():
-            edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt for editing.")
-            edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, label="CFG Image Scale", info="Controls how much the model preserves input image details during editing.")
+    with gr.Tab("🖌️ Image Edit"):
         with gr.Row():
-            edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Interval", info="Start of CFG application interval for editing (end is fixed at 1.0).")
-            edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="text_channel", label="CFG Renorm Type", info="Normalization type for CFG during editing. Use 'global' if output is blurry.")
-        with gr.Row():
-            edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization during editing (1.0 disables CFG-Renorm).")
+            with gr.Column(scale=1):
+                edit_image_input = gr.Image(label="Input Image", value=load_example_image('test_images/women.jpg'))
+                edit_prompt = gr.Textbox(
+                    label="Prompt",
+                    value="She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes."
+                )
+
+            with gr.Column(scale=1):
+                edit_image_output = gr.Image(label="Result")
+                edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
+
         with gr.Row():
-            edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image editing.")
-            edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details during editing.")
-
-    # Main chat processing function
-    @spaces.GPU(duration=90) # Apply GPU decorator to the combined function
-    def process_chat_message(history, prompt, uploaded_image, mode,
-                             show_thinking_global_val, seed_global_val,
-                             common_do_sample_val, common_text_temperature_val, common_max_think_token_n_val,
-                             t2i_cfg_text_scale_val, t2i_cfg_interval_val, t2i_timestep_shift_val,
-                             t2i_num_timesteps_val, t2i_cfg_renorm_min_val, t2i_cfg_renorm_type_val,
-                             t2i_image_ratio_val,
-                             edit_cfg_text_scale_val, edit_cfg_img_scale_val, edit_cfg_interval_val,
-                             edit_timestep_shift_val, edit_num_timesteps_val, edit_cfg_renorm_min_val,
-                             edit_cfg_renorm_type_val):
-
-        # Append user message to history
-        history.append([prompt, None])
-
-        # Define common parameters for inference functions
-        common_infer_params = dict(
-            show_thinking=show_thinking_global_val,
-            do_sample=common_do_sample_val,
-            text_temperature=common_text_temperature_val,
-        )
-
-        try:
-            if mode == "Text to Image":
-                # Add T2I specific parameters, including max_think_token_n and seed
-                t2i_params = {
-                    **common_infer_params,
-                    "max_think_token_n": common_max_think_token_n_val,
-                    "seed": seed_global_val,
-                    "cfg_text_scale": t2i_cfg_text_scale_val,
-                    "cfg_interval": t2i_cfg_interval_val,
-                    "timestep_shift": t2i_timestep_shift_val,
-                    "num_timesteps": t2i_num_timesteps_val,
-                    "cfg_renorm_min": t2i_cfg_renorm_min_val,
-                    "cfg_renorm_type": t2i_cfg_renorm_type_val,
-                    "image_ratio": t2i_image_ratio_val,
-                }
-
-                for img, txt in text_to_image(
-                    prompt=prompt,
-                    **t2i_params
-                ):
-                    # For Text to Image, yield image first, then thinking text (if available)
-                    if img is not None:
-                        history[-1] = [prompt, (img, txt)]
-                    elif txt: # Only update text if image is not ready yet
-                        history[-1] = [prompt, txt]
-                    yield history, gr.update(value="") # Update chatbot and clear input
-
-            elif mode == "Image Understanding":
-                if uploaded_image is None:
-                    history[-1] = [prompt, "Please upload an image for Image Understanding."]
-                    yield history, gr.update(value="")
-                    return
-
-                # Add Understanding specific parameters (max_new_tokens maps to common_max_think_token_n)
-                # Note: seed is not used in image_understanding
-                understand_params = {
-                    **common_infer_params,
-                    "max_new_tokens": common_max_think_token_n_val,
-                }
-                # Remove seed from parameters as it's not used by image_understanding
-                understand_params.pop('seed', None)
-
-                for txt in image_understanding(
-                    image=uploaded_image,
-                    prompt=prompt,
-                    **understand_params
-                ):
-                    history[-1] = [prompt, txt]
-                    yield history, gr.update(value="")
-
-            elif mode == "Image Edit":
-                if uploaded_image is None:
-                    history[-1] = [prompt, "Please upload an image for Image Editing."]
-                    yield history, gr.update(value="")
-                    return
-
-                # Add Edit specific parameters, including max_think_token_n and seed
-                edit_params = {
-                    **common_infer_params,
-                    "max_think_token_n": common_max_think_token_n_val,
-                    "seed": seed_global_val,
-                    "cfg_text_scale": edit_cfg_text_scale_val,
-                    "cfg_img_scale": edit_cfg_img_scale_val,
-                    "cfg_interval": edit_cfg_interval_val,
-                    "timestep_shift": edit_timestep_shift_val,
-                    "num_timesteps": edit_num_timesteps_val,
-                    "cfg_renorm_min": edit_cfg_renorm_min_val,
-                    "cfg_renorm_type": edit_cfg_renorm_type_val,
-                }
-
-                for img, txt in edit_image(
-                    image=uploaded_image,
-                    prompt=prompt,
-                    **edit_params
-                ):
-                    # For Image Edit, yield image first, then thinking text (if available)
-                    if img is not None:
-                        history[-1] = [prompt, (img, txt)]
-                    elif txt: # Only update text if image is not ready yet
-                        history[-1] = [prompt, txt]
-                    yield history, gr.update(value="")
-
-        except Exception as e:
-            history[-1] = [prompt, f"An error occurred: {e}"]
-            yield history, gr.update(value="") # Update history with error and clear input
-
-    # Event handlers for dynamic UI updates and submission
-    # Control visibility of thinking parameters
-    show_thinking_global.change(
-        fn=lambda x: gr.update(visible=x),
-        inputs=[show_thinking_global],
-        outputs=[thinking_params_container]
-    )
-
-    # Clear image input if mode switches to Text to Image
-    mode_selector.change(
-        fn=lambda mode: gr.update(value=None) if mode == "Text to Image" else gr.update(),
-        inputs=[mode_selector],
-        outputs=[image_input]
-    )
-
-    # List of all input components whose values are passed to process_chat_message
-    inputs_list = [
-        chatbot, user_prompt, image_input, mode_selector,
-        show_thinking_global, seed_global,
-        common_do_sample, common_text_temperature, common_max_think_token_n,
-        t2i_cfg_text_scale, t2i_cfg_interval, t2i_timestep_shift,
-        t2i_num_timesteps, t2i_cfg_renorm_min, t2i_cfg_renorm_type,
-        t2i_image_ratio,
-        edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
-        edit_timestep_shift, edit_num_timesteps, edit_cfg_renorm_min,
-        edit_cfg_renorm_type
-    ]
-
-    # Link submit button and text input 'Enter' key to the processing function
-    submit_btn.click(
-        fn=process_chat_message,
-        inputs=inputs_list,
-        outputs=[chatbot, user_prompt],
-        scroll_to_output=True,
-        queue=False, # Set to True if long generation times cause issues, but might affect responsiveness
-    )
-    user_prompt.submit( # Allows pressing Enter in textbox to submit
-        fn=process_chat_message,
-        inputs=inputs_list,
-        outputs=[chatbot, user_prompt],
-        scroll_to_output=True,
-        queue=False,
-    )
+            edit_show_thinking = gr.Checkbox(label="Thinking", value=False)
+
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            with gr.Group():
+                with gr.Row():
+                    edit_seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, interactive=True,
+                                          label="Seed", info="0 for random seed, positive for reproducible results")
+                    edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
+                                                    label="CFG Text Scale", info="Controls how strongly the model follows the text prompt")
+
+                with gr.Row():
+                    edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, interactive=True,
+                                                   label="CFG Image Scale", info="Controls how much the model preserves input image details")
+                    edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                                  label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
+
+                with gr.Row():
+                    edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
+                                                       value="text_channel", label="CFG Renorm Type",
+                                                       info="If the generated image is blurry, use 'global'")
+                    edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                                    label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
+
+                with gr.Row():
+                    edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
+                                                   label="Timesteps", info="Total denoising steps")
+                    edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, interactive=True,
+                                                    label="Timestep Shift", info="Higher values for layout, lower for details")
+
+            # Thinking parameters in a single row
+            edit_thinking_params = gr.Group(visible=False)
+            with edit_thinking_params:
+                with gr.Row():
+                    edit_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                    edit_max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
+                                                       label="Max Think Tokens", info="Maximum number of tokens for thinking")
+                    edit_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
+                                                      label="Temperature", info="Controls randomness in text generation")
+
+        edit_btn = gr.Button("Submit")
+
+        # Dynamically show/hide thinking process box for editing
+        def update_edit_thinking_visibility(show):
+            return gr.update(visible=show), gr.update(visible=show)
+
+        edit_show_thinking.change(
+            fn=update_edit_thinking_visibility,
+            inputs=[edit_show_thinking],
+            outputs=[edit_thinking_output, edit_thinking_params]
+        )
+
+        edit_btn.click(
+            fn=edit_image,
+            inputs=[
+                edit_image_input, edit_prompt, edit_show_thinking,
+                edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
+                edit_timestep_shift, edit_num_timesteps,
+                edit_cfg_renorm_min, edit_cfg_renorm_type,
+                edit_max_think_token_n, edit_do_sample, edit_text_temperature, edit_seed
+            ],
+            outputs=[edit_image_output, edit_thinking_output]
+        )
+
+    with gr.Tab("🖼️ Image Understanding"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                img_input = gr.Image(label="Input Image", value=load_example_image('test_images/meme.jpg'))
+                understand_prompt = gr.Textbox(
+                    label="Prompt",
+                    value="Can someone explain what's funny about this meme??"
+                )
+
+            with gr.Column(scale=1):
+                txt_output = gr.Textbox(label="Result", lines=20)
+
+        with gr.Row():
+            understand_show_thinking = gr.Checkbox(label="Thinking", value=False)
+
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            with gr.Row():
+                understand_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                understand_text_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, interactive=True,
+                                                        label="Temperature", info="Controls randomness in text generation (0=deterministic, 1=creative)")
+                understand_max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=512, step=64, interactive=True,
+                                                      label="Max New Tokens", info="Maximum length of generated text, including potential thinking")
+
+        img_understand_btn = gr.Button("Submit")
+
+        img_understand_btn.click(
+            fn=image_understanding,
+            inputs=[
+                img_input, understand_prompt, understand_show_thinking,
+                understand_do_sample, understand_text_temperature, understand_max_new_tokens
+            ],
+            outputs=txt_output
+        )
 
 demo.launch()
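The `update_thinking_visibility` handlers added above all follow the same fan-out pattern: one checkbox drives a `gr.update(visible=...)` for each of several components at once. A compact standalone sketch of that pattern (component names illustrative, not from this app):

```python
# Sketch of the show/hide pattern used for the "Thinking" controls:
# one boolean input fans out one gr.update per output component.
import gradio as gr

with gr.Blocks() as demo:
    show = gr.Checkbox(label="Thinking", value=False)
    thinking_box = gr.Textbox(label="Thinking Process", visible=False)
    params = gr.Group(visible=False)
    with params:
        gr.Slider(64, 4096, value=1024, label="Max Think Tokens")

    show.change(
        fn=lambda s: (gr.update(visible=s), gr.update(visible=s)),
        inputs=[show],
        outputs=[thinking_box, params],
    )

# demo.launch()
```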
 