# Commit 7410d3d (verified) by mehdinathani: "Update app.py"
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
import gradio as gr
from PIL import Image
from transformers import pipeline
# -------------------------
# Set Up Speech Recognition Pipeline
# -------------------------
# Whisper-based automatic speech recognition pipeline, built once at module
# load and shared by every transcription call.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
)
def convert_audio_to_text(audio_file):
    """
    Convert an audio file to text using the ASR pipeline.

    Args:
        audio_file: The path or file-like object containing the recorded audio.

    Returns:
        The transcribed text.

    Raises:
        ValueError: If no audio was supplied (``audio_file`` is ``None`` or an
            empty path string).
    """
    # Fail fast with a readable message instead of handing an empty input to
    # the model and surfacing whatever low-level error it produces.
    if audio_file is None or (isinstance(audio_file, str) and not audio_file):
        raise ValueError("No audio input provided for transcription.")

    result = asr_pipeline(audio_file)
    return result["text"]
# -------------------------
# Define Your Custom Tools
# -------------------------
@tool
def my_custom_tool(arg1: str, arg2: int) -> str:
    """A tool that does nothing yet.

    Args:
        arg1: the first argument used to trigger creative responses.
        arg2: the second argument, an integer value for any purpose.

    Returns:
        A string prompting creativity.
    """
    # NOTE(review): the @tool decorator presumably reads this docstring to
    # describe the tool and its arguments — keep the Args entries in sync
    # with the signature.
    # Placeholder behaviour: both arguments are accepted but ignored.
    placeholder_reply = "What magic will you build?"
    return placeholder_reply
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """Fetches the current local time in a specified timezone.

    Args:
        timezone: a valid timezone string (e.g., 'America/New_York').

    Returns:
        The current time as a formatted string or an error message.
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
    except pytz.UnknownTimeZoneError:
        # This exception is a KeyError, so str(e) would only repeat the quoted
        # zone name; spell out what actually went wrong instead.
        return (
            f"Error fetching time for timezone '{timezone}': "
            "unknown timezone name (expected an IANA name such as 'America/New_York')"
        )
    except Exception as e:
        # Any other failure (e.g. a non-string argument) is still reported as
        # text rather than raised, so the tool always returns a string.
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
    return f"The current local time in {timezone} is: {local_time}"
# -------------------------
# Agent and Tool Setup
# -------------------------
# Final-answer tool instance; it is registered in the agent's tool list.
final_answer = FinalAnswerTool()

# Language model configuration handed to the agent.
model = HfApiModel(
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Use an alternative endpoint if needed
    temperature=0.5,
    max_tokens=2096,
    custom_role_conversions=None,
)

# Load the image generation tool from the Hub.
image_generation_tool = load_tool(
    "agents-course/text-to-image",
    trust_remote_code=True,
)
# Load the agent's prompt templates. The encoding is pinned so the parsed
# text does not depend on the platform's default locale.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)
# Every tool the agent may call, in registration order.
tools = [final_answer, my_custom_tool, get_current_time_in_timezone, image_generation_tool]

# Code-writing agent wired to the model, tools and prompt templates above.
agent = CodeAgent(
    name="MyHuggingFaceAgent",
    description="An agent with multiple tools for code generation and utility tasks.",
    model=model,
    tools=tools,
    prompt_templates=prompt_templates,
    max_steps=6,
    verbosity_level=1,
    planning_interval=None,
    grammar=None,
)
# ------------------------------------------------
# Function to Generate Image Based on Style
# ------------------------------------------------
# Prompt suffix appended for each named style. Any style not listed here
# (including "General") leaves the prompt unchanged.
_STYLE_SUFFIXES = {
    "Logo": "logo design, minimalistic, vector art",
    "Wallpaper": "wallpaper, high resolution, elegant, modern",
    "Natural": "natural style, realistic, vibrant colors",
    "Drawing": "drawing, sketch, pencil drawing style, artistic",
}


def generate_image(prompt: str, style: str):
    """
    Generate an image using the provided prompt and a selected style.

    Args:
        prompt (str): The user's image prompt.
        style (str): The selected style option, e.g., "Logo", "Wallpaper", "General", "Natural", or "Drawing".

    Returns:
        A tuple containing the path to the generated image and a status message.
        When the prompt is missing or blank, the path is ``None`` and the
        message explains the problem; the image tool is not called.
    """
    # Guard: without this, a blank prompt would reach the model as nothing
    # but a style suffix (e.g. ", logo design, ...").
    if prompt is None or not str(prompt).strip():
        return None, "Error: no prompt provided. Please enter or record a prompt."

    suffix = _STYLE_SUFFIXES.get(style)
    modified_prompt = f"{prompt}, {suffix}" if suffix else prompt

    image = image_generation_tool(prompt=modified_prompt)

    if isinstance(image, str):
        # The tool already returned a path; pass it through untouched.
        image_path = image
    else:
        # NOTE(review): the fixed file name means each call overwrites the
        # previous image — confirm requests are never processed concurrently.
        image_path = "output_image.png"
        image.save(image_path)

    return image_path, f"Generated image with style: {style}"
# ------------------------------------------------
# Combined Function for Text or Speech Input
# ------------------------------------------------
def generate_image_from_input(input_mode: str, text_prompt: str, audio_prompt, style: str):
    """
    Generate an image using either a text prompt or a speech prompt.

    Args:
        input_mode (str): Either "Text" or "Speech".
        text_prompt (str): The text prompt (used if input_mode is "Text").
        audio_prompt: The audio recording (used if input_mode is "Speech").
        style (str): The chosen style option.

    Returns:
        A tuple containing the generated image and a status message. If Speech
        mode is selected but nothing was recorded, or transcription fails, the
        image is ``None`` and the message describes the error.
    """
    if input_mode != "Speech":
        # Any mode other than "Speech" uses the typed prompt.
        return generate_image(text_prompt, style)

    # Speech mode with no recording: report it directly rather than letting
    # the transcription step fail with an opaque library error.
    if audio_prompt is None:
        return None, "Error: no audio recorded. Please record a prompt or switch to Text mode."

    try:
        prompt_text = convert_audio_to_text(audio_prompt)
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        return None, f"Error in speech-to-text conversion: {str(e)}"

    return generate_image(prompt_text, style)
# ------------------------------------------------
# Gradio Interface
# ------------------------------------------------
# Input widgets, listed in the positional order generate_image_from_input expects.
input_mode_radio = gr.Radio(
    choices=["Text", "Speech"],
    value="Text",
    label="Input Mode",
)
text_prompt_box = gr.Textbox(
    value="A high-res, photorealistic image of a cat, sitting on a windowsill, looking outside.",
    placeholder="Enter your image prompt here",
    label="Text Prompt",
)
audio_prompt_recorder = gr.Audio(
    label="Speak Your Prompt (if Speech mode selected)",
    type="filepath",
)
style_radio = gr.Radio(
    choices=["Logo", "Wallpaper", "General", "Natural", "Drawing"],
    value="General",
    label="Style Option",
)

# Output widgets, matching the (image, status message) tuple the callback returns.
generated_image_view = gr.Image(label="Generated Image")
status_box = gr.Textbox(label="Status Message")

interface = gr.Interface(
    fn=generate_image_from_input,
    inputs=[input_mode_radio, text_prompt_box, audio_prompt_recorder, style_radio],
    outputs=[generated_image_view, status_box],
    title="Customizable Image Generator with Voice Prompt Option",
    description="Choose your input mode (Text or Speech), enter or record your prompt, select a style, and generate an image.",
)

# Start the web app as soon as the module is executed.
interface.launch()