PHI4-Multimodal

Runtime error

App Files Files Community

prithivMLmods committed on Feb 28

Commit

035efc4

verified ·

1 Parent(s): ace15c9

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -166

app.py CHANGED Viewed

@@ -39,9 +39,14 @@ from diffusers.utils import export_to_ply
 # Additional import for Phi-4 multimodality (audio support)
 import soundfile as sf
 os.system('pip install backoff')
-# Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
@@ -53,35 +58,30 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
 def glb_to_data_url(glb_path: str) -> str:
     """
     Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
-    (Not used in this method.)
     """
     with open(glb_path, "rb") as f:
         data = f.read()
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
-def get_file_path(file):
     """
-    Normalize a file input. If the input is a string, assume it is a file path.
-    Otherwise, if the object has a 'name' attribute or key, return that.
     """
     if isinstance(file, str):
-        return file
-    elif hasattr(file, "name"):
-        return file.name
-    elif isinstance(file, dict) and "name" in file:
-        return file["name"]
     else:
-        return None
-# Model class for Text-to-3D Generation (ShapE)
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
-        # Ensure the text encoder is in half precision to avoid dtype mismatches.
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
@@ -90,7 +90,6 @@ class Model:
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
-        # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
@@ -98,7 +97,6 @@ class Model:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
-        # Rotate the mesh for proper orientation
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
@@ -133,7 +131,7 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
-# New Tools for Web Functionality using DuckDuckGo and smolagents
 from typing import Any, Optional
 from smolagents.tools import Tool
@@ -141,7 +139,7 @@ import duckduckgo_search
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
-    description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
     inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
     output_type = "string"
@@ -151,24 +149,20 @@ class DuckDuckGoSearchTool(Tool):
         try:
             from duckduckgo_search import DDGS
         except ImportError as e:
-            raise ImportError(
-                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
-            ) from e
         self.ddgs = DDGS(**kwargs)
     def forward(self, query: str) -> str:
         results = self.ddgs.text(query, max_results=self.max_results)
         if len(results) == 0:
-            raise Exception("No results found! Try a less restrictive/shorter query.")
-        postprocessed_results = [
-            f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
-        ]
         return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
 class VisitWebpageTool(Tool):
     name = "visit_webpage"
-    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
-    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
     output_type = "string"
     def __init__(self, *args, **kwargs):
@@ -179,33 +173,23 @@ class VisitWebpageTool(Tool):
             import requests
             from markdownify import markdownify
             from requests.exceptions import RequestException
             from smolagents.utils import truncate_content
         except ImportError as e:
-            raise ImportError(
-                "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
-            ) from e
         try:
-            # Send a GET request to the URL with a 20-second timeout
             response = requests.get(url, timeout=20)
-            response.raise_for_status()  # Raise an exception for bad status codes
-            # Convert the HTML content to Markdown
             markdown_content = markdownify(response.text).strip()
-            # Remove multiple line breaks
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
-            return "The request timed out. Please try again later or check the URL."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
-            return f"An unexpected error occurred: {str(e)}"
-# rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
@@ -216,22 +200,17 @@ ragent_client = OpenAI(
 )
 SYSTEM_PROMPT = """
-        "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
-        "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
-        "2. **Code**: Write Python code to implement your solution.\n"
-        "3. **Observation**: Analyze the output of the code and summarize the results.\n"
-        "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
-        f"Task: {task}"
 """
 def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
-    """
-    Uses the Llama mode OpenAI model to perform a structured reasoning chain.
-    """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # Incorporate conversation history (if any)
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
@@ -252,17 +231,17 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
          response += token
          yield response
-# Gradio UI configuration
 DESCRIPTION = """
-# Agent Dino 🌠 """
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
 #duplicate-button {
   margin: auto;
   color: #fff;
@@ -277,9 +256,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -289,13 +266,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-# Voices for text-to-speech
 TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
 ]
-# Load multimodal processor and model (e.g. for OCR and image processing)
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -304,35 +279,23 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Asynchronous text-to-speech
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     return output_file
-# Utility function to clean conversation history
 def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
     cleaned = []
     for msg in chat_history:
         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
             cleaned.append(msg)
     return cleaned
-# Stable Diffusion XL Pipeline for Image Generation
-#Model In Use : SG161222/RealVisXL_V5.0_Lightning
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
@@ -341,18 +304,14 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     add_watermarker=False,
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
@@ -372,10 +331,8 @@ def generate_image_fn(
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate images using the SDXL pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
     options = {
         "prompt": [prompt] * num_images,
         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
@@ -388,9 +345,7 @@ def generate_image_fn(
     }
     if use_resolution_binning:
         options["use_resolution_binning"] = True
     images = []
-    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
@@ -405,8 +360,6 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# Text-to-3D Generation using the ShapE Pipeline
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
@@ -415,39 +368,28 @@ def generate_3d_fn(
     num_steps: int = 64,
     randomize_seed: bool = False,
 ):
-    """
-    Generate a 3D model from text using the ShapE pipeline.
-    Returns a tuple of (glb_file_path, used_seed).
-    """
     seed = int(randomize_seed_fn(seed, randomize_seed))
     model3d = Model()
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
-# YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
 yolo_detector = YOLODetector(yolo_model_path)
 def detect_objects(image: np.ndarray):
-    """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
-# Phi-4 Multimodal Model Setup with Text Streaming
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
     phi4_model_path,
@@ -457,11 +399,10 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
     _attn_implementation="eager",
 )
-def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
     """
     Process an image or audio input with the Phi-4 multimodal model.
-    Uses a text streamer to yield incremental outputs.
-    Expects input_type to be either 'image' or 'audio'.
     """
     user_prompt = '<|user|>'
     assistant_prompt = '<|assistant|>'
@@ -471,24 +412,22 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
         yield "Please upload a file and provide a question."
         return
-    file_path = get_file_path(file)
-    if file_path is None:
-        yield "Could not determine the file path."
-        return
-    if input_type.lower() == "image":
-        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        image = Image.open(file_path)
-        inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
-    elif input_type.lower() == "audio":
-        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        audio, samplerate = sf.read(file_path)
-        inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
-    else:
-        yield "Invalid input type selected."
         return
-    # Setup text streamer using TextIteratorStreamer for incremental generation
     streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
@@ -501,8 +440,6 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
         time.sleep(0.01)
         yield buffer
-# Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -514,19 +451,54 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and special commands:
-      - "@tts1" or "@tts2": triggers text-to-speech.
-      - "@image": triggers image generation using the SDXL pipeline.
-      - "@3d": triggers 3D model generation using the ShapE pipeline.
-      - "@web": triggers a web search or webpage visit.
-      - "@ragent": initiates a reasoning chain using Llama mode.
-      - "@yolo": triggers object detection using YOLO.
-      - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
         yield "🌀 Hold tight, generating a 3D mesh GLB file....."
@@ -537,18 +509,15 @@ def generate(
             num_steps=64,
             randomize_seed=True,
         )
-        # Copy the GLB file to a static folder.
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
-    # --- Image Generation branch ---
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
         yield "🪧 Generating image..."
@@ -568,7 +537,6 @@ def generate(
         yield gr.Image(image_paths[0])
         return
-    # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
@@ -585,7 +553,6 @@ def generate(
             yield results
         return
-    # --- rAgent Reasoning branch ---
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
@@ -593,7 +560,6 @@ def generate(
             yield partial
         return
-    # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
@@ -604,7 +570,7 @@ def generate(
             if isinstance(input_file, str):
                 pil_image = Image.open(input_file)
             else:
-                pil_image = Image.open(get_file_path(input_file))
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
@@ -613,28 +579,9 @@ def generate(
         yield gr.Image(result_img)
         return
-    # --- Phi-4 Multimodal branch with text streaming ---
-    if text.strip().lower().startswith("@phi4"):
-        parts = text.strip().split(maxsplit=2)
-        if len(parts) < 3:
-            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
-            return
-        input_type = parts[1]
-        question = parts[2]
-        if not files or len(files) == 0:
-            yield "Error: Please attach an image or audio file for Phi-4 processing."
-            return
-        file_input = files[0]
-        yield "🔄 Processing multimodal input with Phi-4..."
-        for partial in process_phi4(input_type, file_input, question):
-            yield partial
-        return
-    # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -644,12 +591,11 @@ def generate(
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
         if len(files) > 1:
-            images = [load_image(get_file_path(image)) for image in files]
         elif len(files) == 1:
-            images = [load_image(get_file_path(files[0]))]
         else:
             images = []
         messages = [{
@@ -665,7 +611,6 @@ def generate(
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield "🤔 Thinking..."
         for new_text in streamer:
@@ -693,21 +638,16 @@ def generate(
         }
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
         outputs = []
         for new_text in streamer:
             outputs.append(new_text)
             yield "".join(outputs)
         final_response = "".join(outputs)
         yield final_response
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
-# Gradio Chat Interface Setup and Launch
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -739,7 +679,6 @@ demo = gr.ChatInterface(
     multimodal=True,
 )
-# Ensure the static folder exists
 if not os.path.exists("static"):
     os.makedirs("static")

 # Additional import for Phi-4 multimodality (audio support)
 import soundfile as sf
+# Install additional dependencies if needed
 os.system('pip install backoff')
+# --- File validation constants ---
+IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
+AUDIO_EXTENSIONS = ['.wav', '.mp3', '.flac', '.ogg']
+# --- Global constants and helper functions ---
 MAX_SEED = np.iinfo(np.int32).max
 def glb_to_data_url(glb_path: str) -> str:
     """
     Encode the GLB file at ``glb_path`` as a base64 ``data:`` URL.

     The file is opened in binary mode, read in full, and its bytes are
     embedded after the ``model/gltf-binary`` media-type prefix.
     """
     with open(glb_path, "rb") as glb_file:
         encoded = base64.b64encode(glb_file.read())
     return "data:model/gltf-binary;base64," + encoded.decode("utf-8")
+def load_audio_file(file):
     """
+    Loads an audio file. If file is a string path, it reads directly.
+    Otherwise, assumes file is a file-like object.
     """
     if isinstance(file, str):
+        audio, samplerate = sf.read(file)
     else:
+        audio, samplerate = sf.read(BytesIO(file.read()))
+    return audio, samplerate
+# --- Model class for Text-to-3D Generation (ShapE) ---
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
+# --- New Tools for Web Functionality using DuckDuckGo and smolagents ---
 from typing import Any, Optional
 from smolagents.tools import Tool
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
+    description = "Performs a duckduckgo web search based on your query then returns the top search results."
     inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
     output_type = "string"
         try:
             from duckduckgo_search import DDGS
         except ImportError as e:
+            raise ImportError("Install duckduckgo-search via pip.") from e
         self.ddgs = DDGS(**kwargs)
     def forward(self, query: str) -> str:
         results = self.ddgs.text(query, max_results=self.max_results)
         if len(results) == 0:
+            raise Exception("No results found! Try a less restrictive query.")
+        postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
         return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
 class VisitWebpageTool(Tool):
     name = "visit_webpage"
+    description = "Visits a webpage at the given URL and returns its content as markdown."
+    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
     output_type = "string"
     def __init__(self, *args, **kwargs):
             import requests
             from markdownify import markdownify
             from requests.exceptions import RequestException
             from smolagents.utils import truncate_content
         except ImportError as e:
+            raise ImportError("Install markdownify and requests via pip.") from e
         try:
             response = requests.get(url, timeout=20)
+            response.raise_for_status()
             markdown_content = markdownify(response.text).strip()
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
+            return "The request timed out. Please try again later."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
+            return f"Unexpected error: {str(e)}"
+# --- rAgent Reasoning using Llama mode OpenAI ---
 from openai import OpenAI
 )
 SYSTEM_PROMPT = """
+        "You are an expert assistant who solves tasks using Python code. Follow these steps:
+        1. Thought: Explain your reasoning and plan.
+        2. Code: Write Python code to implement your solution.
+        3. Observation: Analyze the output.
+        4. Final Answer: Provide a concise conclusion.
+        Task: {task}"
 """
 def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
          response += token
          yield response
+# --- Gradio UI configuration ---
 DESCRIPTION = """
+# Agent Dino 🌠
+"""
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
 #duplicate-button {
   margin: auto;
   color: #fff;
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# --- Load Models and Pipelines for Chat, Image, and Multimodal Processing ---
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 TTS_VOICES = [
+    "en-US-JennyNeural",
+    "en-US-GuyNeural",
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Build an ``edge_tts.Communicate`` for ``text`` and ``voice``, await its
     ``save`` to ``output_file``, and return ``output_file``.
     """
     await edge_tts.Communicate(text, voice).save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Return a new list holding only the dict entries of ``chat_history``
     whose ``"content"`` value is a string; original order is kept.
     """
     return [
         entry
         for entry in chat_history
         if isinstance(entry, dict) and isinstance(entry.get("content"), str)
     ]
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     add_watermarker=False,
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 def save_image(img: Image.Image) -> str:
     """
     Save ``img`` under a fresh ``<uuid4>.png`` relative filename and return
     that filename.
     """
     filename = f"{uuid.uuid4()}.png"
     img.save(filename)
     return filename
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
     options = {
         "prompt": [prompt] * num_images,
         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
     }
     if use_resolution_binning:
         options["use_resolution_binning"] = True
     images = []
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
     num_steps: int = 64,
     randomize_seed: bool = False,
 ):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     model3d = Model()
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
 yolo_detector = YOLODetector(yolo_model_path)
 def detect_objects(image: np.ndarray):
     """
     Run ``yolo_detector`` on ``image`` and return an annotated PIL image.

     The first detector result is converted to ``sv.Detections`` with NMS
     applied; boxes and then labels are drawn onto a copy of ``image`` so
     the input array itself is not passed to the annotators.
     """
     prediction = yolo_detector(image, verbose=False)[0]
     found = sv.Detections.from_ultralytics(prediction).with_nms()
     boxes, labels = sv.BoxAnnotator(), sv.LabelAnnotator()
     canvas = boxes.annotate(scene=image.copy(), detections=found)
     canvas = labels.annotate(scene=canvas, detections=found)
     return Image.fromarray(canvas)
+# --- Phi-4 Multimodal Model Setup with Text Streaming ---
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
     phi4_model_path,
     _attn_implementation="eager",
 )
+def process_phi4(input_type: str, file: str, question: str, max_new_tokens: int = 200):
     """
     Process an image or audio input with the Phi-4 multimodal model.
+    Expects input_type to be either 'image' or 'audio' and file is a file path.
     """
     user_prompt = '<|user|>'
     assistant_prompt = '<|assistant|>'
         yield "Please upload a file and provide a question."
         return
+    try:
+        if input_type == "image":
+            prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+            image = load_image(file)
+            inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
+        elif input_type == "audio":
+            prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+            audio, samplerate = load_audio_file(file)
+            inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+        else:
+            yield "Invalid input type selected. Use 'image' or 'audio'."
+            return
+    except Exception as e:
+        yield f"Error loading file: {str(e)}"
         return
     streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
         time.sleep(0.01)
         yield buffer
 @spaces.GPU
 def generate(
     input_dict: dict,
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates chatbot responses with support for multimodal input and special commands.
+    Special commands include:
+      - "@tts1" or "@tts2": Text-to-speech.
+      - "@image": Image generation using the SDXL pipeline.
+      - "@3d": 3D model generation using the ShapE pipeline.
+      - "@web": Web search or webpage visit.
+      - "@ragent": Reasoning chain using Llama mode.
+      - "@yolo": Object detection using YOLO.
+      - "@phi4": Processes image or audio inputs with the Phi-4 model and streams text output.
     """
     # This function is a generator: it yields status strings, partial text and
     # Gradio components (gr.File / gr.Image / gr.Audio) depending on the branch.
     # "text" is required in input_dict; "files" is optional and defaults to [].
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # --- Phi-4 Multimodal branch with text streaming ---
+    if text.strip().lower().startswith("@phi4"):
         # maxsplit=2 keeps the question intact: parts == [command, input_type, question].
+        parts = text.strip().split(maxsplit=2)
+        if len(parts) < 3:
+            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
+            return
+        input_type = parts[1].lower()
+        question = parts[2]
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image or audio file for Phi-4 processing."
+            return
+        if len(files) > 1:
             # NOTE(review): this warning is yielded like any other message, so the
             # status message yielded below may replace it in the UI; confirm.
+            yield "Warning: Multiple files attached. Only the first file will be processed."
+        file_input = files[0]  # This is a string path from gr.MultimodalTextbox
         # NOTE(review): os.path.splitext assumes file_input is a str/path-like value.
+        extension = os.path.splitext(file_input)[1].lower()
         # Only "image" and "audio" are extension-checked here; any other input_type
         # falls through to process_phi4.
+        if input_type == "image" and extension not in IMAGE_EXTENSIONS:
+            yield f"Error: Attached file is not an image. Expected extensions: {', '.join(IMAGE_EXTENSIONS)}"
+            return
+        elif input_type == "audio" and extension not in AUDIO_EXTENSIONS:
+            yield f"Error: Attached file is not an audio file. Expected extensions: {', '.join(AUDIO_EXTENSIONS)}"
+            return
+        yield "🔄 Processing multimodal input with Phi-4..."
         # Re-yield each partial result; exceptions raised while iterating
         # process_phi4 are reported to the user as a message instead of propagating.
+        try:
+            for partial in process_phi4(input_type, file_input, question):
+                yield partial
+        except Exception as e:
+            yield f"Error processing file: {str(e)}"
+        return
+    # --- Other branches remain unchanged ---
     # NOTE(review): the command checks below test text.strip() but slice the
     # unstripped text, so leading whitespace in the message shifts the slice.
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
         yield "🌀 Hold tight, generating a 3D mesh GLB file....."
             num_steps=64,
             randomize_seed=True,
         )
         # Copy the generated GLB into ./static under a unique name and return it as a file.
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
         yield "🪧 Generating image..."
         yield gr.Image(image_paths[0])
         return
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
             yield results
         return
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
             yield partial
         return
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
             # NOTE(review): both branches below call Image.open(input_file)
             # identically, so the isinstance check currently has no effect.
             if isinstance(input_file, str):
                 pil_image = Image.open(input_file)
             else:
+                pil_image = Image.open(input_file)
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
         yield gr.Image(result_img)
         return
     # Detect an "@tts1" / "@tts2" prefix; voice_index is 1 or 2 when one is
     # present and None otherwise.
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         # voice_index is 1-based; TTS_VOICES is indexed from 0.
         voice = TTS_VOICES[voice_index - 1]
         # NOTE(review): str.replace removes every occurrence of the prefix in the
         # message, not only the leading command.
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
         # NOTE(review): under `if files:` the final else is unreachable, and the
         # first two branches build the same list a single comprehension would.
         if len(files) > 1:
+            images = [load_image(file) for file in files]
         elif len(files) == 1:
+            images = [load_image(files[0])]
         else:
             images = []
         messages = [{
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         # generate() runs on a background thread so this generator can consume the streamer.
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield "🤔 Thinking..."
         for new_text in streamer:
         }
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
         # Each yield carries the full text accumulated so far, not just the new piece.
         outputs = []
         for new_text in streamer:
             outputs.append(new_text)
             yield "".join(outputs)
         # Yield the complete text once more (same content as the last streamed partial).
         final_response = "".join(outputs)
         yield final_response
         # `voice` is only assigned in the TTS prefix block above; the `is_tts and`
         # short-circuit keeps it from being evaluated when no TTS command was given.
         if is_tts and voice:
             # presumably text_to_speech is a coroutine returning an audio file path; confirm.
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 # Chat UI definition: messages are handled by generate().
 # NOTE(review): multimodal=True is presumably what makes generate() receive a
 # dict with "text" and "files" keys; confirm against the Gradio documentation.
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
     multimodal=True,
 )
 # Make sure the "static" directory exists at import time. exist_ok=True makes
 # this a no-op when the directory is already present and removes the
 # check-then-create race of a separate os.path.exists() test.
 os.makedirs("static", exist_ok=True)