Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -36,8 +36,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
|
| 36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
| 37 |
from diffusers.utils import export_to_ply
|
| 38 |
|
| 39 |
-
os.system('pip install backoff')
|
| 40 |
-
|
| 41 |
# Global constants and helper functions
|
| 42 |
|
| 43 |
MAX_SEED = np.iinfo(np.int32).max
|
|
@@ -259,7 +257,15 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
|
|
| 259 |
# ------------------------------------------------------------------------------
|
| 260 |
|
| 261 |
DESCRIPTION = """
|
| 262 |
-
# Agent Dino 🌠
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
"""
|
| 264 |
|
| 265 |
css = '''
|
|
@@ -469,7 +475,7 @@ def generate(
|
|
| 469 |
- "@web": triggers a web search or webpage visit.
|
| 470 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
| 471 |
- "@yolo": triggers object detection using YOLO.
|
| 472 |
-
- **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
|
| 473 |
"""
|
| 474 |
text = input_dict["text"]
|
| 475 |
files = input_dict.get("files", [])
|
|
@@ -565,7 +571,7 @@ def generate(
|
|
| 565 |
yield gr.Image(result_img)
|
| 566 |
return
|
| 567 |
|
| 568 |
-
# --- Phi-4 Multimodal branch (Image/Audio)
|
| 569 |
if text.strip().lower().startswith("@phi4"):
|
| 570 |
question = text[len("@phi4"):].strip()
|
| 571 |
if not files:
|
|
@@ -574,15 +580,14 @@ def generate(
|
|
| 574 |
if not question:
|
| 575 |
yield "Error: Please provide a question after @phi4."
|
| 576 |
return
|
|
|
|
| 577 |
# Determine input type (Image or Audio) from the first file
|
| 578 |
input_file = files[0]
|
| 579 |
try:
|
| 580 |
-
# If file is already a PIL Image, treat as image
|
| 581 |
if isinstance(input_file, Image.Image):
|
| 582 |
input_type = "Image"
|
| 583 |
file_for_phi4 = input_file
|
| 584 |
else:
|
| 585 |
-
# Try opening as image; if it fails, assume audio
|
| 586 |
try:
|
| 587 |
file_for_phi4 = Image.open(input_file)
|
| 588 |
input_type = "Image"
|
|
@@ -592,7 +597,7 @@ def generate(
|
|
| 592 |
except Exception:
|
| 593 |
input_type = "Audio"
|
| 594 |
file_for_phi4 = input_file
|
| 595 |
-
|
| 596 |
if input_type == "Image":
|
| 597 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
| 598 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
@@ -603,19 +608,22 @@ def generate(
|
|
| 603 |
else:
|
| 604 |
yield "Invalid file type for @phi4 multimodal processing."
|
| 605 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
|
| 607 |
-
# Set up a streamer for the phi4 model
|
| 608 |
-
streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
|
| 609 |
-
generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
|
| 610 |
-
thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
|
| 611 |
-
thread_phi4.start()
|
| 612 |
-
|
| 613 |
-
outputs_phi4 = []
|
| 614 |
-
yield "🤔 Thinking..."
|
| 615 |
-
for new_text in streamer_phi4:
|
| 616 |
-
outputs_phi4.append(new_text)
|
| 617 |
-
yield "".join(outputs_phi4)
|
| 618 |
-
return
|
| 619 |
|
| 620 |
# --- Text and TTS branch ---
|
| 621 |
tts_prefix = "@tts"
|
|
@@ -705,16 +713,15 @@ demo = gr.ChatInterface(
|
|
| 705 |
gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
|
| 706 |
],
|
| 707 |
examples=[
|
| 708 |
-
[{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
|
| 709 |
-
[{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
|
| 710 |
["@tts2 What causes rainbows to form?"],
|
| 711 |
["@image Chocolate dripping from a donut"],
|
| 712 |
["@3d A birthday cupcake with cherry"],
|
| 713 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
| 714 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
| 715 |
-
["@
|
| 716 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 717 |
["@tts1 Explain Tower of Hanoi"],
|
|
|
|
| 718 |
],
|
| 719 |
cache_examples=False,
|
| 720 |
type="messages",
|
|
|
|
| 36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
| 37 |
from diffusers.utils import export_to_ply
|
| 38 |
|
|
|
|
|
|
|
| 39 |
# Global constants and helper functions
|
| 40 |
|
| 41 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
|
| 257 |
# ------------------------------------------------------------------------------
|
| 258 |
|
| 259 |
DESCRIPTION = """
|
| 260 |
+
# Agent Dino 🌠
|
| 261 |
+
This chatbot supports various commands:
|
| 262 |
+
- **@tts1 / @tts2:** text-to-speech
|
| 263 |
+
- **@image:** image generation
|
| 264 |
+
- **@3d:** 3D mesh generation
|
| 265 |
+
- **@web:** web search/visit
|
| 266 |
+
- **@rAgent:** reasoning chain
|
| 267 |
+
- **@yolo:** object detection
|
| 268 |
+
- **@phi4:** multimodal (image/audio) question answering
|
| 269 |
"""
|
| 270 |
|
| 271 |
css = '''
|
|
|
|
| 475 |
- "@web": triggers a web search or webpage visit.
|
| 476 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
| 477 |
- "@yolo": triggers object detection using YOLO.
|
| 478 |
+
- **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
|
| 479 |
"""
|
| 480 |
text = input_dict["text"]
|
| 481 |
files = input_dict.get("files", [])
|
|
|
|
| 571 |
yield gr.Image(result_img)
|
| 572 |
return
|
| 573 |
|
| 574 |
+
# --- Phi-4 Multimodal branch (Image/Audio) ---
|
| 575 |
if text.strip().lower().startswith("@phi4"):
|
| 576 |
question = text[len("@phi4"):].strip()
|
| 577 |
if not files:
|
|
|
|
| 580 |
if not question:
|
| 581 |
yield "Error: Please provide a question after @phi4."
|
| 582 |
return
|
| 583 |
+
|
| 584 |
# Determine input type (Image or Audio) from the first file
|
| 585 |
input_file = files[0]
|
| 586 |
try:
|
|
|
|
| 587 |
if isinstance(input_file, Image.Image):
|
| 588 |
input_type = "Image"
|
| 589 |
file_for_phi4 = input_file
|
| 590 |
else:
|
|
|
|
| 591 |
try:
|
| 592 |
file_for_phi4 = Image.open(input_file)
|
| 593 |
input_type = "Image"
|
|
|
|
| 597 |
except Exception:
|
| 598 |
input_type = "Audio"
|
| 599 |
file_for_phi4 = input_file
|
| 600 |
+
|
| 601 |
if input_type == "Image":
|
| 602 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
| 603 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
|
|
| 608 |
else:
|
| 609 |
yield "Invalid file type for @phi4 multimodal processing."
|
| 610 |
return
|
| 611 |
+
|
| 612 |
+
with torch.no_grad():
|
| 613 |
+
generate_ids = phi4_model.generate(
|
| 614 |
+
**inputs,
|
| 615 |
+
max_new_tokens=200,
|
| 616 |
+
num_logits_to_keep=0,
|
| 617 |
+
streamer=streamer # Adding text streamer
|
| 618 |
+
)
|
| 619 |
+
|
| 620 |
+
buffer = "⚛️ phi4 multimodal is initiated, hold tight"
|
| 621 |
+
for new_text in streamer:
|
| 622 |
+
buffer += new_text
|
| 623 |
+
buffer = buffer.replace("<|im_end|>", "")
|
| 624 |
+
time.sleep(0.01)
|
| 625 |
+
yield buffer
|
| 626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
|
| 628 |
# --- Text and TTS branch ---
|
| 629 |
tts_prefix = "@tts"
|
|
|
|
| 713 |
gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
|
| 714 |
],
|
| 715 |
examples=[
|
|
|
|
|
|
|
| 716 |
["@tts2 What causes rainbows to form?"],
|
| 717 |
["@image Chocolate dripping from a donut"],
|
| 718 |
["@3d A birthday cupcake with cherry"],
|
| 719 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
| 720 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
| 721 |
+
["@rAgent Explain how a binary search algorithm works."],
|
| 722 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 723 |
["@tts1 Explain Tower of Hanoi"],
|
| 724 |
+
["@phi4 What is depicted in this image?"], # Example for @phi4
|
| 725 |
],
|
| 726 |
cache_examples=False,
|
| 727 |
type="messages",
|