KingNish committed (verified)
Commit dbd2fc4 · 1 Parent(s): 397bb2f

Update app.py

Files changed (1): app.py (+193 −228)

app.py CHANGED
@@ -178,13 +178,13 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
     result = {"text": "", "image": None}
     # Call inferencer with or without think parameter based on user choice
     for i in inferencer(text=prompt, think=show_thinking, understanding_output=False, **inference_hyper):
-        # print(type(i)) # For debugging stream
+        print(type(i))
         if type(i) == str:
             result["text"] += i
         else:
             result["image"] = i
 
-        yield result["image"], result.get("text", "")
+        yield result["image"], result.get("text", None)
 
 
 # Image Understanding function with thinking option and hyperparameters
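Note on the hunk above: `text_to_image` is a generator, so each `yield` streams a partial `(image, text)` pair into whatever pair of Gradio outputs the function is later bound to. A minimal, self-contained sketch of that streaming contract (component and function names here are illustrative, not from this app):

```python
# Sketch: a generator handler that streams (image, text) tuples into
# two Gradio outputs, mirroring the yield pattern in text_to_image.
import numpy as np
import gradio as gr

def fake_generate(prompt):
    text = ""
    for word in prompt.split():
        text += word + " "
        yield None, text                      # text streams in, image not ready yet
    image = np.zeros((64, 64, 3), dtype=np.uint8)
    yield image, text                         # final yield carries the image

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    img_out = gr.Image(label="Image")
    txt_out = gr.Textbox(label="Thinking")
    gr.Button("Go").click(fake_generate, inputs=prompt_box, outputs=[img_out, txt_out])

# demo.launch()  # generator handlers stream via Gradio's queue (on by default in recent versions)
```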
@@ -192,8 +192,7 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
 def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
                         do_sample=False, text_temperature=0.3, max_new_tokens=512):
     if image is None:
-        yield "Please upload an image for understanding."
-        return
+        return "Please upload an image."
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
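The `isinstance(image, np.ndarray)` guard in the context lines exists because Gradio can hand the handler a numpy array while the model path expects PIL. A standalone sketch of that normalization (the `convert("RGB")` step is an extra precaution, not something this app does):

```python
# Sketch: normalize a Gradio image input (numpy array or PIL.Image) to PIL.
import numpy as np
from PIL import Image

def to_pil(image):
    if image is None:
        return None
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    return image.convert("RGB")  # assumption: the model wants 3-channel input

arr = np.zeros((32, 32, 3), dtype=np.uint8)
print(type(to_pil(arr)))  # <class 'PIL.Image.Image'>
```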
@@ -204,24 +203,22 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
     inference_hyper = dict(
         do_sample=do_sample,
         temperature=text_temperature,
-        max_think_token_n=max_new_tokens, # Set max_length for text generation
+        max_think_token_n=max_new_tokens, # Set max_length
     )
 
-    result_text = ""
+    result = {"text": "", "image": None}
     # Use show_thinking parameter to control thinking process
     for i in inferencer(image=image, text=prompt, think=show_thinking,
                         understanding_output=True, **inference_hyper):
         if type(i) == str:
-            result_text += i
-            yield result_text
-        # else: This branch seems unused in original, as understanding_output=True typically yields text.
-        # If it yielded image, it would be an intermediate. For final output, it's text.
-        # For now, we assume it only yields text.
-    yield result_text # Ensure final text is yielded
+            result["text"] += i
+        else:
+            result["image"] = i
+        yield result["text"]
 
 
 # Image Editing function with thinking option and hyperparameters
-@spaces.GPU(duration=90)
+@spaces.GPU(duration=120)
 def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_scale=4.0,
                cfg_img_scale=2.0, cfg_interval=0.0,
                timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
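One caveat with the new `return "Please upload an image."` guard in the hunk above: `image_understanding` still contains a `yield`, so it remains a generator function, and a `return value` inside a generator is only attached to `StopIteration` rather than delivered as output, so a caller that merely iterates (as Gradio does) would see nothing. A quick illustration of that semantics:

```python
# Illustration: 'return value' inside a generator never reaches a caller
# that only iterates; it becomes the StopIteration payload instead.
def guard(image=None):
    if image is None:
        return "Please upload an image."  # swallowed by iteration
    yield "result"

print(list(guard()))  # [] -- the message is never yielded

gen = guard()
try:
    next(gen)
except StopIteration as stop:
    print(stop.value)  # 'Please upload an image.'
```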
@@ -231,8 +228,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
     set_seed(seed)
 
     if image is None:
-        yield None, "Please upload an image for editing." # Yield tuple for image/text
-        return
+        return "Please upload an image.", ""
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
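`set_seed(seed)` above is defined elsewhere in app.py. For context, a typical implementation seeds every RNG involved in sampling; this is a hedged sketch of the usual pattern, not the repo's actual helper:

```python
# Hypothetical set_seed: seeds Python, NumPy, and PyTorch RNGs so results
# are reproducible. The app's own helper may differ in details.
import random
import numpy as np
import torch

def set_seed(seed: int) -> None:
    if seed > 0:  # the UI documents 0 as "random seed"
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
```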
@@ -261,7 +257,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
         else:
             result["image"] = i
 
-        yield result["image"], result.get("text", "") # Yield tuple for image/text
+        yield result["image"], result.get("text", "")
 
 # Helper function to load example images
 def load_example_image(image_path):
@@ -271,232 +267,201 @@ def load_example_image(image_path):
         print(f"Error loading example image: {e}")
         return None
 
+
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("""
-    <div>
-    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
-    </div>
-    # BAGEL Multimodal Chatbot
-    Interact with BAGEL to generate images from text, edit existing images, or understand image content.
-    """)
-
-    # Chatbot display area
-    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(None, "https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/BAGEL_favicon.png"))
-
-    # Input area
-    with gr.Row():
-        image_input = gr.Image(type="pil", label="Optional: Upload an Image (for Image Understanding/Edit)", scale=0.5, value=None)
+    <div>
+    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
+    </div>
+    """)
+
+    with gr.Tab("📝 Text to Image"):
+        txt_input = gr.Textbox(
+            label="Prompt",
+            value="A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
+        )
 
-        with gr.Column(scale=1.5):
-            user_prompt = gr.Textbox(label="Your Message", placeholder="Type your prompt here...", lines=3)
-
-            with gr.Row():
-                mode_selector = gr.Radio(
-                    choices=["Text to Image", "Image Understanding", "Image Edit"],
-                    value="Text to Image",
-                    label="Select Mode",
-                    interactive=True
-                )
-                submit_btn = gr.Button("Send", variant="primary")
-
-    # Global/Shared Hyperparameters
-    with gr.Accordion("General Settings & Hyperparameters", open=False) as general_accordion:
         with gr.Row():
-            show_thinking_global = gr.Checkbox(label="Show Thinking Process", value=False, info="Enable to see model's intermediate thinking text.")
-            seed_global = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, label="Seed", info="0 for random seed, positive for reproducible results.")
+            show_thinking = gr.Checkbox(label="Thinking", value=False)
 
-        # Container for thinking-specific parameters, visibility controlled by show_thinking_global
-        thinking_params_container = gr.Group(visible=False)
-        with thinking_params_container:
-            gr.Markdown("#### Thinking Process Parameters (affect text generation)")
-            with gr.Row():
-                common_do_sample = gr.Checkbox(label="Enable Sampling", value=False, info="Enable sampling for text generation (otherwise greedy).")
-                common_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Text Temperature", info="Controls randomness in text generation (higher = more random).")
-                common_max_think_token_n = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Think Tokens / Max New Tokens", info="Maximum number of tokens for thinking (T2I/Edit) or generated text (Understanding).")
-
-    # T2I Hyperparameters
-    t2i_params_accordion = gr.Accordion("Text to Image Specific Parameters", open=False)
-    with t2i_params_accordion:
-        gr.Markdown("#### Text to Image Parameters")
-        with gr.Row():
-            t2i_image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"], value="1:1", label="Image Ratio", info="The longer size is fixed to 1024 pixels.")
-        with gr.Row():
-            t2i_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0 recommended).")
-            t2i_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="CFG Interval", info="Start of Classifier-Free Guidance application interval (end is fixed at 1.0).")
-        with gr.Row():
-            t2i_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="global", label="CFG Renorm Type", info="Normalization type for CFG. Use 'global' if the generated image is blurry.")
-            t2i_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization (1.0 disables CFG-Renorm).")
-        with gr.Row():
-            t2i_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image generation.")
-            t2i_timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details.")
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            # Two parameters per row
+            with gr.Group():
+                with gr.Row():
+                    seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1,
+                                     label="Seed", info="0 for random seed, positive for reproducible results")
+                    image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"],
+                                              value="1:1", label="Image Ratio",
+                                              info="The longer size is fixed to 1024")
+
+                with gr.Row():
+                    cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
+                                               label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0)")
+                    cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1,
+                                             label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
+
+                with gr.Row():
+                    cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
+                                                  value="global", label="CFG Renorm Type",
+                                                  info="If the generated image is blurry, use 'global'")
+                    cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                               label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
+
+                with gr.Row():
+                    num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
+                                              label="Timesteps", info="Total denoising steps")
+                    timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, interactive=True,
+                                               label="Timestep Shift", info="Higher values for layout, lower for details")
+
+            # Thinking parameters in a single row
+            thinking_params = gr.Group(visible=False)
+            with thinking_params:
+                with gr.Row():
+                    do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                    max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
+                                                  label="Max Think Tokens", info="Maximum number of tokens for thinking")
+                    text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
+                                                 label="Temperature", info="Controls randomness in text generation")
+
+        thinking_output = gr.Textbox(label="Thinking Process", visible=False)
+        img_output = gr.Image(label="Generated Image")
+        gen_btn = gr.Button("Generate")
+
+        # Dynamically show/hide thinking process box and parameters
+        def update_thinking_visibility(show):
+            return gr.update(visible=show), gr.update(visible=show)
+
+        show_thinking.change(
+            fn=update_thinking_visibility,
+            inputs=[show_thinking],
+            outputs=[thinking_output, thinking_params]
+        )
+
+        gen_btn.click(
+            fn=text_to_image,
+            inputs=[
+                txt_input, show_thinking, cfg_text_scale,
+                cfg_interval, timestep_shift,
+                num_timesteps, cfg_renorm_min, cfg_renorm_type,
+                max_think_token_n, do_sample, text_temperature, seed, image_ratio
+            ],
+            outputs=[img_output, thinking_output]
+        )
 
-    # Image Edit Hyperparameters
-    edit_params_accordion = gr.Accordion("Image Edit Specific Parameters", open=False)
-    with edit_params_accordion:
-        gr.Markdown("#### Image Edit Parameters")
-        with gr.Row():
-            edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt for editing.")
-            edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, label="CFG Image Scale", info="Controls how much the model preserves input image details during editing.")
+    with gr.Tab("🖌️ Image Edit"):
         with gr.Row():
-            edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Interval", info="Start of CFG application interval for editing (end is fixed at 1.0).")
-            edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="text_channel", label="CFG Renorm Type", info="Normalization type for CFG during editing. Use 'global' if output is blurry.")
-        with gr.Row():
-            edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization during editing (1.0 disables CFG-Renorm).")
+            with gr.Column(scale=1):
+                edit_image_input = gr.Image(label="Input Image", value=load_example_image('test_images/women.jpg'))
+                edit_prompt = gr.Textbox(
+                    label="Prompt",
+                    value="She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes."
+                )
+
+            with gr.Column(scale=1):
+                edit_image_output = gr.Image(label="Result")
+                edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
+
         with gr.Row():
-            edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image editing.")
-            edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details during editing.")
-
-    # Main chat processing function
-    @spaces.GPU(duration=90) # Apply GPU decorator to the combined function
-    def process_chat_message(history, prompt, uploaded_image, mode,
-                             show_thinking_global_val, seed_global_val,
-                             common_do_sample_val, common_text_temperature_val, common_max_think_token_n_val,
-                             t2i_cfg_text_scale_val, t2i_cfg_interval_val, t2i_timestep_shift_val,
-                             t2i_num_timesteps_val, t2i_cfg_renorm_min_val, t2i_cfg_renorm_type_val,
-                             t2i_image_ratio_val,
-                             edit_cfg_text_scale_val, edit_cfg_img_scale_val, edit_cfg_interval_val,
-                             edit_timestep_shift_val, edit_num_timesteps_val, edit_cfg_renorm_min_val,
-                             edit_cfg_renorm_type_val):
-
-        # Append user message to history
-        history.append([prompt, None])
-
-        # Define common parameters for inference functions
-        common_infer_params = dict(
-            show_thinking=show_thinking_global_val,
-            do_sample=common_do_sample_val,
-            text_temperature=common_text_temperature_val,
-        )
-
-        try:
-            if mode == "Text to Image":
-                # Add T2I specific parameters, including max_think_token_n and seed
-                t2i_params = {
-                    **common_infer_params,
-                    "max_think_token_n": common_max_think_token_n_val,
-                    "seed": seed_global_val,
-                    "cfg_text_scale": t2i_cfg_text_scale_val,
-                    "cfg_interval": t2i_cfg_interval_val,
-                    "timestep_shift": t2i_timestep_shift_val,
-                    "num_timesteps": t2i_num_timesteps_val,
-                    "cfg_renorm_min": t2i_cfg_renorm_min_val,
-                    "cfg_renorm_type": t2i_cfg_renorm_type_val,
-                    "image_ratio": t2i_image_ratio_val,
-                }
-
-                for img, txt in text_to_image(
-                    prompt=prompt,
-                    **t2i_params
-                ):
-                    # For Text to Image, yield image first, then thinking text (if available)
-                    if img is not None:
-                        history[-1] = [prompt, (img, txt)]
-                    elif txt: # Only update text if image is not ready yet
-                        history[-1] = [prompt, txt]
-                    yield history, gr.update(value="") # Update chatbot and clear input
-
-            elif mode == "Image Understanding":
-                if uploaded_image is None:
-                    history[-1] = [prompt, "Please upload an image for Image Understanding."]
-                    yield history, gr.update(value="")
-                    return
-
-                # Add Understanding specific parameters (max_new_tokens maps to common_max_think_token_n)
-                # Note: seed is not used in image_understanding
-                understand_params = {
-                    **common_infer_params,
-                    "max_new_tokens": common_max_think_token_n_val,
-                }
-                # Remove seed from parameters as it's not used by image_understanding
-                understand_params.pop('seed', None)
-
-                for txt in image_understanding(
-                    image=uploaded_image,
-                    prompt=prompt,
-                    **understand_params
-                ):
-                    history[-1] = [prompt, txt]
-                    yield history, gr.update(value="")
-
-            elif mode == "Image Edit":
-                if uploaded_image is None:
-                    history[-1] = [prompt, "Please upload an image for Image Editing."]
-                    yield history, gr.update(value="")
-                    return
-
-                # Add Edit specific parameters, including max_think_token_n and seed
-                edit_params = {
-                    **common_infer_params,
-                    "max_think_token_n": common_max_think_token_n_val,
-                    "seed": seed_global_val,
-                    "cfg_text_scale": edit_cfg_text_scale_val,
-                    "cfg_img_scale": edit_cfg_img_scale_val,
-                    "cfg_interval": edit_cfg_interval_val,
-                    "timestep_shift": edit_timestep_shift_val,
-                    "num_timesteps": edit_num_timesteps_val,
-                    "cfg_renorm_min": edit_cfg_renorm_min_val,
-                    "cfg_renorm_type": edit_cfg_renorm_type_val,
-                }
-
-                for img, txt in edit_image(
-                    image=uploaded_image,
-                    prompt=prompt,
-                    **edit_params
-                ):
-                    # For Image Edit, yield image first, then thinking text (if available)
-                    if img is not None:
-                        history[-1] = [prompt, (img, txt)]
-                    elif txt: # Only update text if image is not ready yet
-                        history[-1] = [prompt, txt]
-                    yield history, gr.update(value="")
-
-        except Exception as e:
-            history[-1] = [prompt, f"An error occurred: {e}"]
-            yield history, gr.update(value="") # Update history with error and clear input
-
-    # Event handlers for dynamic UI updates and submission
-    # Control visibility of thinking parameters
-    show_thinking_global.change(
-        fn=lambda x: gr.update(visible=x),
-        inputs=[show_thinking_global],
-        outputs=[thinking_params_container]
-    )
-
-    # Clear image input if mode switches to Text to Image
-    mode_selector.change(
-        fn=lambda mode: gr.update(value=None) if mode == "Text to Image" else gr.update(),
-        inputs=[mode_selector],
-        outputs=[image_input]
-    )
-
-    # List of all input components whose values are passed to process_chat_message
-    inputs_list = [
-        chatbot, user_prompt, image_input, mode_selector,
-        show_thinking_global, seed_global,
-        common_do_sample, common_text_temperature, common_max_think_token_n,
-        t2i_cfg_text_scale, t2i_cfg_interval, t2i_timestep_shift,
-        t2i_num_timesteps, t2i_cfg_renorm_min, t2i_cfg_renorm_type,
-        t2i_image_ratio,
-        edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
-        edit_timestep_shift, edit_num_timesteps, edit_cfg_renorm_min,
-        edit_cfg_renorm_type
-    ]
-
-    # Link submit button and text input 'Enter' key to the processing function
-    submit_btn.click(
-        fn=process_chat_message,
-        inputs=inputs_list,
-        outputs=[chatbot, user_prompt],
-        scroll_to_output=True,
-        queue=False, # Set to True if long generation times cause issues, but might affect responsiveness
-    )
-    user_prompt.submit( # Allows pressing Enter in textbox to submit
-        fn=process_chat_message,
-        inputs=inputs_list,
-        outputs=[chatbot, user_prompt],
-        scroll_to_output=True,
-        queue=False,
-    )
+            edit_show_thinking = gr.Checkbox(label="Thinking", value=False)
+
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            with gr.Group():
+                with gr.Row():
+                    edit_seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, interactive=True,
+                                          label="Seed", info="0 for random seed, positive for reproducible results")
+                    edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
+                                                    label="CFG Text Scale", info="Controls how strongly the model follows the text prompt")
+
+                with gr.Row():
+                    edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, interactive=True,
+                                                   label="CFG Image Scale", info="Controls how much the model preserves input image details")
+                    edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                                  label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
+
+                with gr.Row():
+                    edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
+                                                       value="text_channel", label="CFG Renorm Type",
+                                                       info="If the generated image is blurry, use 'global'")
+                    edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
+                                                    label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
+
+                with gr.Row():
+                    edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
+                                                   label="Timesteps", info="Total denoising steps")
+                    edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, interactive=True,
+                                                    label="Timestep Shift", info="Higher values for layout, lower for details")
+
+            # Thinking parameters in a single row
+            edit_thinking_params = gr.Group(visible=False)
+            with edit_thinking_params:
+                with gr.Row():
+                    edit_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                    edit_max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
+                                                       label="Max Think Tokens", info="Maximum number of tokens for thinking")
+                    edit_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
+                                                      label="Temperature", info="Controls randomness in text generation")
+
+        edit_btn = gr.Button("Submit")
+
+        # Dynamically show/hide thinking process box for editing
+        def update_edit_thinking_visibility(show):
+            return gr.update(visible=show), gr.update(visible=show)
+
+        edit_show_thinking.change(
+            fn=update_edit_thinking_visibility,
+            inputs=[edit_show_thinking],
+            outputs=[edit_thinking_output, edit_thinking_params]
+        )
+
+        edit_btn.click(
+            fn=edit_image,
+            inputs=[
+                edit_image_input, edit_prompt, edit_show_thinking,
+                edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
+                edit_timestep_shift, edit_num_timesteps,
+                edit_cfg_renorm_min, edit_cfg_renorm_type,
+                edit_max_think_token_n, edit_do_sample, edit_text_temperature, edit_seed
+            ],
+            outputs=[edit_image_output, edit_thinking_output]
+        )
+
+    with gr.Tab("🖼️ Image Understanding"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                img_input = gr.Image(label="Input Image", value=load_example_image('test_images/meme.jpg'))
+                understand_prompt = gr.Textbox(
+                    label="Prompt",
+                    value="Can someone explain what's funny about this meme??"
+                )
+
+            with gr.Column(scale=1):
+                txt_output = gr.Textbox(label="Result", lines=20)
+
+        with gr.Row():
+            understand_show_thinking = gr.Checkbox(label="Thinking", value=False)
+
+        # Add hyperparameter controls in an accordion
+        with gr.Accordion("Inference Hyperparameters", open=False):
+            with gr.Row():
+                understand_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
+                understand_text_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, interactive=True,
+                                                        label="Temperature", info="Controls randomness in text generation (0=deterministic, 1=creative)")
+                understand_max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=512, step=64, interactive=True,
+                                                      label="Max New Tokens", info="Maximum length of generated text, including potential thinking")
+
+        img_understand_btn = gr.Button("Submit")
+
+        img_understand_btn.click(
+            fn=image_understanding,
+            inputs=[
+                img_input, understand_prompt, understand_show_thinking,
+                understand_do_sample, understand_text_temperature, understand_max_new_tokens
+            ],
+            outputs=txt_output
+        )
 
 demo.launch()
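The `update_thinking_visibility` handlers added above all follow the same fan-out pattern: one checkbox drives a `gr.update(visible=...)` for each of several components at once. A compact standalone sketch of that pattern (component names illustrative, not from this app):

```python
# Sketch of the show/hide pattern used for the "Thinking" controls:
# one boolean input fans out one gr.update per output component.
import gradio as gr

with gr.Blocks() as demo:
    show = gr.Checkbox(label="Thinking", value=False)
    thinking_box = gr.Textbox(label="Thinking Process", visible=False)
    params = gr.Group(visible=False)
    with params:
        gr.Slider(64, 4096, value=1024, label="Max Think Tokens")

    show.change(
        fn=lambda s: (gr.update(visible=s), gr.update(visible=s)),
        inputs=[show],
        outputs=[thinking_box, params],
    )

# demo.launch()
```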
 