prithivMLmods committed
Commit a165b0a · verified · 1 Parent(s): e685e73

Update app.py

Files changed (1)
  1. app.py +7 -90
app.py CHANGED
@@ -5,11 +5,10 @@ import json
 import time
 import asyncio
 import tempfile
+from threading import Thread
 import base64
 import shutil
 import re
-import gc
-from threading import Thread
 
 import gradio as gr
 import spaces
@@ -34,10 +33,7 @@ from transformers.image_utils import load_image
 
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
-from diffusers.utils import export_to_ply, export_to_video
-
-# NEW IMPORTS FOR TEXT-TO-VIDEO
-from diffusers import LTXPipeline, LTXImageToVideoPipeline
+from diffusers.utils import export_to_ply
 
 # Global constants and helper functions
 
@@ -92,7 +88,7 @@ class Model:
         return mesh_path.name
 
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
+        generator = torch.Generator(device=self.device).manual_seed(seed)
         images = self.pipe(
             prompt,
             generator=generator,
@@ -105,7 +101,7 @@ class Model:
         return self.to_glb(ply_path.name)
 
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
+        generator = torch.Generator(device=self.device).manual_seed(seed)
         images = self.pipe_img(
             image,
             generator=generator,
@@ -239,9 +235,7 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # Gradio UI configuration
 
 DESCRIPTION = """
-# Agent Dino 🌠
-Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
-"""
+# Agent Dino 🌠 """
 
 css = '''
 h1 {
@@ -410,64 +404,6 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 
-# ---------------------------
-# NEW: Text-to-Video Generation
-# ---------------------------
-
-# Initialize text-to-video pipeline
-t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
-t2v_pipe.to(device)
-
-def get_time_cost(run_task_time, time_cost_str):
-    now_time = int(time.time() * 1000)
-    if run_task_time == 0:
-        time_cost_str = 'start'
-    else:
-        if time_cost_str != '':
-            time_cost_str += f'-->'
-        time_cost_str += f'{now_time - run_task_time}'
-    run_task_time = now_time
-    return run_task_time, time_cost_str
-
-@spaces.GPU(duration=60)
-def text_to_video(
-    prompt: str,
-    negative_prompt: str,
-    width: int = 768,
-    height: int = 512,
-    num_frames: int = 121,
-    frame_rate: int = 25,
-    num_inference_steps: int = 30,
-    seed: int = 8,
-    progress: gr.Progress = gr.Progress(),
-):
-    generator = torch.Generator(device=device).manual_seed(seed)
-    run_task_time = 0
-    time_cost_str = ''
-    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
-    try:
-        with torch.no_grad():
-            video = t2v_pipe(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                generator=generator,
-                width=width,
-                height=height,
-                num_frames=num_frames,
-                num_inference_steps=num_inference_steps,
-            ).frames[0]
-    finally:
-        torch.cuda.empty_cache()
-        gc.collect()
-    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
-
-    output_path = tempfile.mktemp(suffix=".mp4")
-    export_to_video(video, output_path, fps=frame_rate)
-
-    del video
-    torch.cuda.empty_cache()
-    return output_path, time_cost_str
-
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
@@ -488,7 +424,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @text2video commands
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
 
 @spaces.GPU
 def generate(
@@ -508,7 +444,6 @@
     - "@web": triggers a web search or webpage visit.
    - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
     - "@yolo": triggers object detection using YOLO.
-    - "@text2video": triggers text-to-video generation.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -604,23 +539,6 @@ def generate(
         yield gr.Image(result_img)
         return
 
-    # --- Text-to-Video Generation branch ---
-    if text.strip().lower().startswith("@text2video"):
-        # Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
-        command_body = text[len("@text2video"):].strip()
-        if "||" in command_body:
-            prompt_text, negative_prompt = command_body.split("||", 1)
-            prompt_text = prompt_text.strip()
-            negative_prompt = negative_prompt.strip()
-        else:
-            prompt_text = command_body
-            negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
-        yield "🎞️ Generating video..."
-        video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
-        yield gr.Video(video_path)
-        yield f"Time cost by step (ms): {time_cost_str}"
-        return
-
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -717,14 +635,13 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
-        ["@text2video A futuristic cityscape at dusk"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @text2video-video gen, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
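For context on the run_text/run_image hunks: the commit switches seeding from self.pipe.device to a device attribute stored on the wrapper class. Below is a minimal sketch of that pattern; the Model stub and make_generator helper are hypothetical stand-ins for illustration, not part of app.py.

import torch

class Model:
    def __init__(self):
        # Store the target device once on the wrapper, so seeding does not
        # depend on a pipeline object's .device property being available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def make_generator(self, seed: int = 0) -> torch.Generator:
        # A torch.Generator created on the same device the pipeline samples on,
        # seeded for reproducible generations across runs.
        return torch.Generator(device=self.device).manual_seed(seed)

model = Model()
generator = model.make_generator(seed=0)
print(generator.initial_seed())  # -> 0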
 