# app.py
import sys
import os

# Make sibling modules (agent.py) importable regardless of the working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import gradio as gr
from agent import NewsReporterAgent, AgentState, add_messages
from langchain_core.messages import HumanMessage, AIMessage

#######################################
# Version check
import torch
import transformers

print("--- 🔍 CHECKING LIBRARY VERSIONS 🔍 ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Gradio version: {gr.__version__}")
print("------------------------------------")
#########################################

# --- 1. Initialize the Agent ---
# This loads the model once when the app starts.
agent = NewsReporterAgent()


# --- 2. Define Gradio Logic Handlers ---
# These functions orchestrate the agent's actions based on UI events.

def run_initial_generation(audio_path, image_path):
    """Handle the first step: process the inputs and generate the initial report."""
    if not audio_path and not image_path:
        return ("Please provide an audio or image file.", None,
                gr.update(visible=False), "", "", "")

    state = AgentState(audio_path=audio_path, image_path=image_path, news_report=[])
    state.update(agent.transcribe_audio(state))
    state.update(agent.describe_image(state))
    state.update(agent.create_report(state))

    latest_report = state["news_report"][-1].content
    transcribed_text = state.get("transcribed_text") or "No audio was provided to transcribe."
    image_description = state.get("image_description") or "No image was provided to describe."
    return latest_report, state, gr.update(visible=True), "", transcribed_text, image_description


def run_revision(feedback, current_state):
    """Handle the revision step based on user feedback."""
    if not feedback or not feedback.strip():
        # Re-populate the UI fields unchanged if the feedback box is empty.
        latest_report = next(
            (msg.content for msg in reversed(current_state["news_report"])
             if isinstance(msg, AIMessage)),
            "",
        )
        transcribed_text = current_state.get("transcribed_text", "")
        image_description = current_state.get("image_description", "")
        return latest_report, current_state, "Please provide feedback.", transcribed_text, image_description

    current_state["news_report"] = add_messages(
        current_state["news_report"], [HumanMessage(content=feedback)]
    )
    current_state.update(agent.revise_report(current_state))

    latest_report = current_state["news_report"][-1].content
    transcribed_text = current_state.get("transcribed_text") or "No audio was provided."
    image_description = current_state.get("image_description") or "No image was provided."
    return latest_report, current_state, "", transcribed_text, image_description


def run_save(current_state):
    """Handle the save step."""
    save_update = agent.save_report(current_state)
    return save_update["final_message"]
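# NOTE: The handlers above assume the following contract with agent.py. It is
# inferred from how the state is used in this file, not confirmed against
# agent.py itself:
#   - AgentState behaves like a dict with keys such as "audio_path",
#     "image_path", "transcribed_text", "image_description", and "news_report"
#     (a list of LangChain messages maintained via add_messages).
#   - Each agent method takes the state and returns a partial dict of updates,
#     which is merged back in with state.update(...).
#   - save_report(...) returns a dict containing a "final_message" string.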
# --- 3. Define the Gradio UI ---

# ------------------------------------------------------------------
# Build examples: audio-only, image-only, and combined
# ------------------------------------------------------------------
AUDIO_EXTENSIONS = [".wav", ".mp3", ".m4a", ".flac"]
IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp", ".gif"]

example_list = []
examples_dir = "examples"
if os.path.isdir(examples_dir):
    audio_files = sorted(
        f for f in os.listdir(examples_dir)
        if any(f.lower().endswith(ext) for ext in AUDIO_EXTENSIONS)
    )
    image_files = sorted(
        f for f in os.listdir(examples_dir)
        if any(f.lower().endswith(ext) for ext in IMAGE_EXTENSIONS)
    )
    # 1) Audio-only examples
    for af in audio_files:
        example_list.append([os.path.join(examples_dir, af), None])
    # 2) Image-only examples
    for imf in image_files:
        example_list.append([None, os.path.join(examples_dir, imf)])
    # 3) Audio + image (pair the first audio with the first image, and so on)
    for af, imf in zip(audio_files, image_files):
        example_list.append([os.path.join(examples_dir, af), os.path.join(examples_dir, imf)])

with gr.Blocks(theme=gr.themes.Soft(), title="Multimodal News Reporter") as demo:
    agent_state = gr.State(value=None)

    gr.Markdown("# 📰 Multimodal News Reporter AI")
    gr.Markdown(
        "- Upload an audio recording and/or a relevant image; the AI will generate a news report you can revise and save.\n"
        "- Output is capped at 128 tokens solely for faster inference.\n"
        "- Note: this demo currently runs on CPU only.\n"
        "- Sample audio is trimmed to 10 seconds for faster inference.\n"
        "- Combined audio + image inference takes ~250-350 seconds; audio-only or image-only is much faster."
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Audio Interview Evidence", type="filepath")
            image_input = gr.Image(label="Image Evidence", type="filepath")
            generate_btn = gr.Button("📝 Generate Initial Report", variant="primary")
            gr.Examples(
                examples=example_list,
                inputs=[audio_input, image_input],
                label="Click an example to test"
            )
        with gr.Column(scale=2):
            report_output = gr.Textbox(label="Generated News Report", lines=12, interactive=False)
            status_output = gr.Markdown(value="")
            with gr.Accordion("Show Source Information", open=False):
                transcribed_audio_output = gr.Textbox(label="🎤 Transcribed Audio", interactive=False, lines=5)
                image_description_output = gr.Textbox(label="🖼️ Image Description", interactive=False, lines=5)
            with gr.Group(visible=False) as revision_group:
                gr.Markdown("### ✍️ Provide Feedback for Revision")
                feedback_input = gr.Textbox(label="Your Feedback", placeholder="e.g., 'Make the tone more formal.'")
                with gr.Row():
                    revise_btn = gr.Button("🔄 Revise Report")
                    save_btn = gr.Button("💾 Save Final Report")

    # --- 4. Wire UI Components to Logic Handlers ---
    # Event wiring must happen inside the gr.Blocks() context.
    generate_btn.click(
        fn=run_initial_generation,
        inputs=[audio_input, image_input],
        outputs=[report_output, agent_state, revision_group, status_output,
                 transcribed_audio_output, image_description_output]
    )
    revise_btn.click(
        fn=run_revision,
        inputs=[feedback_input, agent_state],
        outputs=[report_output, agent_state, status_output,
                 transcribed_audio_output, image_description_output]
    ).then(fn=lambda: "", outputs=[feedback_input])  # Clear the feedback box after a revision.
    save_btn.click(
        fn=run_save,
        inputs=[agent_state],
        outputs=[status_output]
    )

# --- 5. Launch the App ---
if __name__ == "__main__":
    demo.launch(debug=True)
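# Usage note: run with `python app.py`; by default Gradio serves the UI at
# http://127.0.0.1:7860. Since combined audio + image inference can take several
# minutes on CPU, enabling Gradio's built-in request queue is a reasonable
# (untested here) tweak to avoid request timeouts:
#
#     demo.queue().launch(debug=True)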