PHI4-Multimodal

Runtime error

App Files Files Community

prithivMLmods committed on Feb 28

Commit

7c9f5e8

verified ·

1 Parent(s): 40825af

Update app.py

Browse files

Files changed (1) hide show

app.py +287 -154

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ import numpy as np
 from PIL import Image
 import edge_tts
 import trimesh
-import soundfile as sf  # Added for audio processing with Phi-4
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
@@ -46,6 +46,10 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     return seed
 def glb_to_data_url(glb_path: str) -> str:
     with open(glb_path, "rb") as f:
         data = f.read()
     b64_data = base64.b64encode(data).decode("utf-8")
@@ -58,6 +62,7 @@ class Model:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
@@ -66,6 +71,7 @@ class Model:
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
@@ -73,6 +79,7 @@ class Model:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
@@ -107,7 +114,7 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
-# Web Tools using DuckDuckGo and smolagents
 from typing import Any, Optional
 from smolagents.tools import Tool
@@ -115,20 +122,25 @@ import duckduckgo_search
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
-    description = "Performs a duckduckgo web search and returns the top results."
-    inputs = {'query': {'type': 'string', 'description': 'The search query.'}}
     output_type = "string"
     def __init__(self, max_results=10, **kwargs):
         super().__init__()
         self.max_results = max_results
-        from duckduckgo_search import DDGS
         self.ddgs = DDGS(**kwargs)
     def forward(self, query: str) -> str:
         results = self.ddgs.text(query, max_results=self.max_results)
         if len(results) == 0:
-            raise Exception("No results found! Try a less restrictive query.")
         postprocessed_results = [
             f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
         ]
@@ -136,28 +148,44 @@ class DuckDuckGoSearchTool(Tool):
 class VisitWebpageTool(Tool):
     name = "visit_webpage"
-    description = "Visits a webpage and returns its content as markdown."
-    inputs = {'url': {'type': 'string', 'description': 'The URL to visit.'}}
     output_type = "string"
     def __init__(self, *args, **kwargs):
         self.is_initialized = False
     def forward(self, url: str) -> str:
-        import requests
-        from markdownify import markdownify
-        from smolagents.utils import truncate_content
         try:
             response = requests.get(url, timeout=20)
-            response.raise_for_status()
             markdown_content = markdownify(response.text).strip()
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
-        except requests.exceptions.Timeout:
-            return "The request timed out."
-        except requests.exceptions.RequestException as e:
-            return f"Error fetching webpage: {str(e)}"
 # rAgent reasoning using a Llama model served through the OpenAI client
 from openai import OpenAI
@@ -169,15 +197,22 @@ ragent_client = OpenAI(
 )
 SYSTEM_PROMPT = """
-"You are an expert assistant who solves tasks using Python code. Follow these steps:
-1. **Thought**: Explain your reasoning and plan.
-2. **Code**: Write Python code to implement your solution.
-3. **Observation**: Analyze the output and summarize results.
-4. **Final Answer**: Provide a concise conclusion."
 """
 def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
@@ -186,23 +221,76 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
-        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-        messages=messages,
     )
     for message in stream:
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-# Load Models
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Text-only model
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -212,8 +300,14 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-# Multimodal model (Qwen2-VL)
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -221,58 +315,55 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Phi-4 Multimodal Model
-phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
-phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
-phi4_model = AutoModelForCausalLM.from_pretrained(
-    phi4_model_path,
-    device_map="auto",
-    torch_dtype="auto",
-    trust_remote_code=True,
-    _attn_implementation="eager",
-)
-phi4_model.eval()
-# Stable Diffusion XL Pipeline
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-# YOLO Object Detection
-YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
-YOLO_CHECKPOINT_NAME = "images/demo.pt"
-yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
-yolo_detector = YOLODetector(yolo_model_path)
-# TTS Voices
-TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-# Utility Functions
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Synthesize *text* with the given Edge TTS *voice* and save it to *output_file*.

     Returns the *output_file* path that was passed in.
     """
     await edge_tts.Communicate(text, voice).save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """Return the dict entries of *chat_history* whose "content" value is a str.

     Entries that are not dicts, or whose "content" is missing or not a string,
     are dropped; the relative order of the kept entries is preserved.
     """
     return [
         entry
         for entry in chat_history
         if isinstance(entry, dict) and isinstance(entry.get("content"), str)
     ]
 def save_image(img: Image.Image) -> str:
     """Write *img* to a freshly named ``<uuid4>.png`` file and return that name.

     The name is relative, so the file lands in the current working directory.
     """
     filename = f"{uuid.uuid4()}.png"
     img.save(filename)
     return filename
@@ -292,8 +383,10 @@ def generate_image_fn(
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
     options = {
         "prompt": [prompt] * num_images,
         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
@@ -306,12 +399,14 @@ def generate_image_fn(
     }
     if use_resolution_binning:
         options["use_resolution_binning"] = True
     images = []
-    for i in range(0, num_images, 1):  # Simplified batching
         batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+1]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"]:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+1]
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)
@@ -321,6 +416,8 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
@@ -329,22 +426,36 @@ def generate_3d_fn(
     num_steps: int = 64,
     randomize_seed: bool = False,
 ):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     model3d = Model()
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 def detect_objects(image: np.ndarray):
     """Run the module-level YOLO detector on *image* and return an annotated PIL image.

     The first detector result is converted to ``sv.Detections`` with NMS applied,
     then boxes and labels are drawn onto a copy of *image*, leaving the input
     array untouched.
     """
     prediction = yolo_detector(image, verbose=False)[0]
     found = sv.Detections.from_ultralytics(prediction).with_nms()
     canvas = image.copy()
     # Boxes first, then labels, matching the original drawing order.
     for annotator in (sv.BoxAnnotator(), sv.LabelAnnotator()):
         canvas = annotator.annotate(scene=canvas, detections=found)
     return Image.fromarray(canvas)
-# Chat Generation Function with @phi4 Added
 @spaces.GPU
 def generate(
@@ -356,13 +467,23 @@ def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ):
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # --- 3D Generation ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
-        yield "🌀 Generating 3D mesh GLB file..."
         glb_path, used_seed = generate_3d_fn(
             prompt=prompt,
             seed=1,
@@ -370,31 +491,41 @@ def generate(
             num_steps=64,
             randomize_seed=True,
         )
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
-    # --- Image Generation ---
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
         yield "🪧 Generating image..."
         image_paths, used_seed = generate_image_fn(
             prompt=prompt,
             seed=1,
             randomize_seed=True,
             num_images=1,
         )
         yield gr.Image(image_paths[0])
         return
-    # --- Web Search/Visit ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
@@ -402,30 +533,36 @@ def generate(
             content = visitor.forward(url)
             yield content
         else:
             query = web_command
-            yield "🧤 Performing web search..."
             searcher = DuckDuckGoSearchTool()
             results = searcher.forward(query)
             yield results
         return
-    # --- rAgent Reasoning ---
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
-        yield "📝 Initiating reasoning chain..."
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
-    # --- YOLO Object Detection ---
     if text.strip().lower().startswith("@yolo"):
-        yield "🔍 Running object detection..."
         if not files or len(files) == 0:
-            yield "Error: Please attach an image for YOLO."
             return
         input_file = files[0]
         try:
-            pil_image = Image.open(input_file)
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
@@ -434,63 +571,64 @@ def generate(
         yield gr.Image(result_img)
         return
-    # --- Phi-4 Multimodal Branch ---
     if text.strip().lower().startswith("@phi4"):
-        parts = text[len("@phi4"):].strip().split(maxsplit=1)
-        if len(parts) < 2:
-            yield "Error: Specify input type and question, e.g., '@phi4 image What is this?'"
             return
-        input_type = parts[0].lower()
-        question = parts[1]
-        if input_type not in ["image", "audio"]:
-            yield "Error: Input type must be 'image' or 'audio'."
             return
-        if not files or len(files) == 0:
-            yield "Error: Please attach a file for Phi-4 processing."
             return
-        if len(files) > 1:
-            yield "Warning: Multiple files attached. Using the first one."
-        file_input = files[0]
-        try:
-            if input_type == "image":
-                prompt = f'<|user|><|image_1|>{question}<|end|><|assistant|>'
-                image = Image.open(file_input)
-                inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
-            elif input_type == "audio":
-                prompt = f'<|user|><|audio_1|>{question}<|end|><|assistant|>'
-                audio, samplerate = sf.read(file_input)
-                inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
-            streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
-            generation_kwargs = {
                 **inputs,
-                "streamer": streamer,
-                "max_new_tokens": max_new_tokens,
-            }
-            thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
-            thread.start()
-            buffer = ""
-            yield "🤔 Thinking..."
-            for new_text in streamer:
-                buffer += new_text
-                buffer = buffer.replace("<|im_end|>", "")
-                time.sleep(0.01)
-                yield buffer
-        except Exception as e:
-            yield f"Error processing file: {str(e)}"
         return
-    # --- Text and TTS Branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -502,7 +640,12 @@ def generate(
         conversation.append({"role": "user", "content": text})
     if files:
-        images = [load_image(image) for image in files]
         messages = [{
             "role": "user",
             "content": [
@@ -528,7 +671,7 @@ def generate(
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input to {MAX_INPUT_TOKEN_LENGTH} tokens.")
         input_ids = input_ids.to(model.device)
         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
@@ -557,24 +700,14 @@ def generate(
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
-# Gradio Interface
-DESCRIPTION = """
-# Agent Dino 🌠
-Multimodal chatbot with text, image, audio, 3D generation, web search, reasoning, and object detection.
-"""
-css = '''
-h1 { text-align: center; }
-#duplicate-button { margin: auto; color: #fff; background: #1565c0; border-radius: 100vh; }
-'''
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
@@ -585,10 +718,9 @@ demo = gr.ChatInterface(
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
         ["@rAgent Explain how a binary search algorithm works."],
-        ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning?"],
         ["@tts1 Explain Tower of Hanoi"],
-        [{"text": "@phi4 image What is shown in this image?", "files": ["examples/image.jpg"]}],
-        [{"text": "@phi4 audio Transcribe this audio.", "files": ["examples/audio.wav"]}],
     ],
     cache_examples=False,
     type="messages",
@@ -596,15 +728,16 @@ demo = gr.ChatInterface(
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
-        label="Query Input",
         file_types=["image", "audio"],
-        file_count="multiple",
-        placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @phi4-multimodal, default-{text gen}{image-text-text}",
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 if not os.path.exists("static"):
     os.makedirs("static")

 from PIL import Image
 import edge_tts
 import trimesh
+import soundfile as sf  # New import for audio file reading
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
     return seed
 def glb_to_data_url(glb_path: str) -> str:
+    """
+    Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
+    (Not used in this method.)
+    """
     with open(glb_path, "rb") as f:
         data = f.read()
     b64_data = base64.b64encode(data).decode("utf-8")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
+        # Ensure the text encoder is in half precision to avoid dtype mismatches.
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
+        # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
+        # Rotate the mesh for proper orientation
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
+# New Tools for Web Functionality using DuckDuckGo and smolagents
 from typing import Any, Optional
 from smolagents.tools import Tool
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
+    description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
+    inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
     output_type = "string"
     def __init__(self, max_results=10, **kwargs):
         super().__init__()
         self.max_results = max_results
+        try:
+            from duckduckgo_search import DDGS
+        except ImportError as e:
+            raise ImportError(
+                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
+            ) from e
         self.ddgs = DDGS(**kwargs)
     def forward(self, query: str) -> str:
         results = self.ddgs.text(query, max_results=self.max_results)
         if len(results) == 0:
+            raise Exception("No results found! Try a less restrictive/shorter query.")
         postprocessed_results = [
             f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
         ]
 class VisitWebpageTool(Tool):
     name = "visit_webpage"
+    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
+    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
     output_type = "string"
     def __init__(self, *args, **kwargs):
         self.is_initialized = False
     def forward(self, url: str) -> str:
         try:
+            import requests
+            from markdownify import markdownify
+            from requests.exceptions import RequestException
+            from smolagents.utils import truncate_content
+        except ImportError as e:
+            raise ImportError(
+                "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
+            ) from e
+        try:
+            # Send a GET request to the URL with a 20-second timeout
             response = requests.get(url, timeout=20)
+            response.raise_for_status()  # Raise an exception for bad status codes
+            # Convert the HTML content to Markdown
             markdown_content = markdownify(response.text).strip()
+            # Remove multiple line breaks
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
+        except requests.exceptions.Timeout:
+            return "The request timed out. Please try again later or check the URL."
+        except RequestException as e:
+            return f"Error fetching the webpage: {str(e)}"
+        except Exception as e:
+            return f"An unexpected error occurred: {str(e)}"
 # rAgent reasoning using a Llama model served through the OpenAI client
 from openai import OpenAI
 )
 SYSTEM_PROMPT = """
+        "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
+        "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
+        "2. **Code**: Write Python code to implement your solution.\n"
+        "3. **Observation**: Analyze the output of the code and summarize the results.\n"
+        "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
+        f"Task: {task}"
 """
 def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
+    """
+    Uses the Llama mode OpenAI model to perform a structured reasoning chain.
+    """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # Incorporate conversation history (if any)
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+         max_tokens=max_tokens,
+         stream=True,
+         temperature=temperature,
+         top_p=top_p,
+         messages=messages,
     )
     for message in stream:
+         token = message.choices[0].delta.content
+         response += token
+         yield response
+# ------------------------------------------------------------------------------
+# New Phi-4 Multimodal Feature (Image & Audio)
+# ------------------------------------------------------------------------------
+# Define prompt structure for Phi-4
+phi4_user_prompt = '<|user|>'
+phi4_assistant_prompt = '<|assistant|>'
+phi4_prompt_suffix = '<|end|>'
+# Load Phi-4 multimodal model and processor using unique variable names
+phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
+phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    _attn_implementation="eager",
+)
+# ------------------------------------------------------------------------------
+# Gradio UI configuration
+# ------------------------------------------------------------------------------
+DESCRIPTION = """
+# Agent Dino 🌠
+This chatbot supports various commands:
+- **@tts1 / @tts2:** text-to-speech
+- **@image:** image generation
+- **@3d:** 3D mesh generation
+- **@web:** web search/visit
+- **@rAgent:** reasoning chain
+- **@yolo:** object detection
+- **@phi4:** multimodal (image/audio) question answering
+"""
+css = '''
+h1 {
+  text-align: center;
+  display: block;
+}
+#duplicate-button {
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;
+}
+'''
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load Models and Pipelines for Chat, Image, and Multimodal Processing
+# Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
+# Voices for text-to-speech
+TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
+]
+# Load multimodal processor and model (e.g. for OCR and image processing)
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# Asynchronous text-to-speech
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     return output_file
+# Utility function to clean conversation history
 def clean_chat_history(chat_history):
+    """
+    Filter out any chat entries whose "content" is not a string.
+    This helps prevent errors when concatenating previous messages.
+    """
     cleaned = []
     for msg in chat_history:
         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
             cleaned.append(msg)
     return cleaned
+# Stable Diffusion XL Pipeline for Image Generation
+# Model In Use : SG161222/RealVisXL_V5.0_Lightning
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
+USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
+sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+    MODEL_ID_SD,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    use_safetensors=True,
+    add_watermarker=False,
+).to(device)
+sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+if torch.cuda.is_available():
+    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
+if USE_TORCH_COMPILE:
+    sd_pipe.compile()
+if ENABLE_CPU_OFFLOAD:
+    sd_pipe.enable_model_cpu_offload()
 def save_image(img: Image.Image) -> str:
+    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
+    """Generate images using the SDXL pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
     options = {
         "prompt": [prompt] * num_images,
         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
     }
     if use_resolution_binning:
         options["use_resolution_binning"] = True
     images = []
+    # Process in batches
+    for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
+        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
+        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
+            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
+# Text-to-3D Generation using the ShapE Pipeline
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
     num_steps: int = 64,
     randomize_seed: bool = False,
 ):
+    """
+    Generate a 3D model from text using the ShapE pipeline.
+    Returns a tuple of (glb_file_path, used_seed).
+    """
     seed = int(randomize_seed_fn(seed, randomize_seed))
     model3d = Model()
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
+# YOLO Object Detection Setup
+YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
+YOLO_CHECKPOINT_NAME = "images/demo.pt"
+yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
+yolo_detector = YOLODetector(yolo_model_path)
 def detect_objects(image: np.ndarray):
+    """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ):
+    """
+    Generates chatbot responses with support for multimodal input and special commands:
+      - "@tts1" or "@tts2": triggers text-to-speech.
+      - "@image": triggers image generation using the SDXL pipeline.
+      - "@3d": triggers 3D model generation using the ShapE pipeline.
+      - "@web": triggers a web search or webpage visit.
+      - "@rAgent": initiates a reasoning chain using Llama mode.
+      - "@yolo": triggers object detection using YOLO.
+      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
+    """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
+        yield "🌀 Hold tight, generating a 3D mesh GLB file....."
         glb_path, used_seed = generate_3d_fn(
             prompt=prompt,
             seed=1,
             num_steps=64,
             randomize_seed=True,
         )
+        # Copy the GLB file to a static folder.
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
+    # --- Image Generation branch ---
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
         yield "🪧 Generating image..."
         image_paths, used_seed = generate_image_fn(
             prompt=prompt,
+            negative_prompt="",
+            use_negative_prompt=False,
             seed=1,
+            width=1024,
+            height=1024,
+            guidance_scale=3,
+            num_inference_steps=25,
             randomize_seed=True,
+            use_resolution_binning=True,
             num_images=1,
         )
         yield gr.Image(image_paths[0])
         return
+    # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
+        # If the command starts with "visit", then treat the rest as a URL
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
             content = visitor.forward(url)
             yield content
         else:
+            # Otherwise, treat the rest as a search query.
             query = web_command
+            yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
             results = searcher.forward(query)
             yield results
         return
+    # --- rAgent Reasoning branch ---
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
+        yield "📝 Initiating reasoning chain using Llama mode..."
+        # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
+    # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
+        yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
+            yield "Error: Please attach an image for YOLO object detection."
             return
+        # Use the first attached image
         input_file = files[0]
         try:
+            if isinstance(input_file, str):
+                pil_image = Image.open(input_file)
+            else:
+                pil_image = input_file
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
         yield gr.Image(result_img)
         return
+    # --- Phi-4 Multimodal branch (Image/Audio) ---
     if text.strip().lower().startswith("@phi4"):
+        question = text[len("@phi4"):].strip()
+        if not files:
+            yield "Error: Please attach an image or audio file for @phi4 multimodal processing."
             return
+        if not question:
+            yield "Error: Please provide a question after @phi4."
             return
+        # Determine input type (Image or Audio) from the first file
+        input_file = files[0]
+        try:
+            # If file is already a PIL Image, treat as image
+            if isinstance(input_file, Image.Image):
+                input_type = "Image"
+                file_for_phi4 = input_file
+            else:
+                # Try opening as image; if it fails, assume audio
+                try:
+                    file_for_phi4 = Image.open(input_file)
+                    input_type = "Image"
+                except Exception:
+                    input_type = "Audio"
+                    file_for_phi4 = input_file
+        except Exception:
+            input_type = "Audio"
+            file_for_phi4 = input_file
+        if input_type == "Image":
+            phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
+            inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
+        elif input_type == "Audio":
+            phi4_prompt = f'{phi4_user_prompt}<|audio_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
+            audio, samplerate = sf.read(file_for_phi4)
+            inputs = phi4_processor(text=phi4_prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+        else:
+            yield "Invalid file type for @phi4 multimodal processing."
             return
+        with torch.no_grad():
+            generate_ids = phi4_model.generate(
                 **inputs,
+                max_new_tokens=200,
+                num_logits_to_keep=0,
+            )
+        input_length = inputs['input_ids'].shape[1]
+        generate_ids = generate_ids[:, input_length:]
+        response = phi4_processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        yield response
         return
+    # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         conversation.append({"role": "user", "content": text})
     if files:
+        if len(files) > 1:
+            images = [load_image(image) for image in files]
+        elif len(files) == 1:
+            images = [load_image(files[0])]
+        else:
+            images = []
         messages = [{
             "role": "user",
             "content": [
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
         input_ids = input_ids.to(model.device)
         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
+# Gradio Chat Interface Setup and Launch
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
         ["@rAgent Explain how a binary search algorithm works."],
+        ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@phi4 What is depicted in this image?"],  # Example for @phi4
     ],
     cache_examples=False,
     type="messages",
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
+        label="Query Input",
         file_types=["image", "audio"],
+        file_count="multiple",
+        placeholder="@tts1, @tts2, @image, @3d, @phi4, @rAgent, @web, @yolo, or plain text"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
+# Ensure the static folder exists
 if not os.path.exists("static"):
     os.makedirs("static")