from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
import gradio as gr
from PIL import Image
from transformers import pipeline

# -------------------------
# Set Up Speech Recognition Pipeline
# -------------------------
# This uses a Whisper model for automatic speech recognition.
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base")


def convert_audio_to_text(audio_file):
    """
    Convert an audio file to text using the ASR pipeline.

    Args:
        audio_file: The path or file-like object containing the recorded audio.

    Returns:
        The transcribed text.
    """
    transcription = asr_pipeline(audio_file)["text"]
    return transcription


# -------------------------
# Define Your Custom Tools
# -------------------------
@tool
def my_custom_tool(arg1: str, arg2: int) -> str:
    """A tool that does nothing yet.

    Args:
        arg1: the first argument used to trigger creative responses.
        arg2: the second argument, an integer value for any purpose.

    Returns:
        A string prompting creativity.
    """
    return "What magic will you build?"


@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """Fetches the current local time in a specified timezone.

    Args:
        timezone: a valid timezone string (e.g., 'America/New_York').

    Returns:
        The current time as a formatted string or an error message.
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


# -------------------------
# Agent and Tool Setup
# -------------------------
final_answer = FinalAnswerTool()

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Use an alternative endpoint if needed
    custom_role_conversions=None,
)

# Load the image generation tool from the Hub.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

tools = [
    final_answer,
    my_custom_tool,
    get_current_time_in_timezone,
    image_generation_tool,
]

agent = CodeAgent(
    model=model,
    tools=tools,
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="MyHuggingFaceAgent",
    description="An agent with multiple tools for code generation and utility tasks.",
    prompt_templates=prompt_templates
)


# ------------------------------------------------
# Function to Generate Image Based on Style
# ------------------------------------------------
def generate_image(prompt: str, style: str):
    """
    Generate an image using the provided prompt and a selected style.

    Args:
        prompt (str): The user's image prompt.
        style (str): The selected style option, e.g., "Logo", "Wallpaper",
            "General", "Natural", or "Drawing".

    Returns:
        A tuple containing the path to the generated image and a status message.
    """
    if style == "Logo":
        modified_prompt = f"{prompt}, logo design, minimalistic, vector art"
    elif style == "Wallpaper":
        modified_prompt = f"{prompt}, wallpaper, high resolution, elegant, modern"
    elif style == "Natural":
        modified_prompt = f"{prompt}, natural style, realistic, vibrant colors"
    elif style == "Drawing":
        modified_prompt = f"{prompt}, drawing, sketch, pencil drawing style, artistic"
    else:  # General
        modified_prompt = prompt

    image = image_generation_tool(prompt=modified_prompt)

    if not isinstance(image, str):
        image_path = "output_image.png"
        image.save(image_path)
    else:
        image_path = image

    status_message = f"Generated image with style: {style}"
    return image_path, status_message


# ------------------------------------------------
# Combined Function for Text or Speech Input
# ------------------------------------------------
def generate_image_from_input(input_mode: str, text_prompt: str, audio_prompt, style: str):
    """
    Generate an image using either a text prompt or a speech prompt.

    Args:
        input_mode (str): Either "Text" or "Speech".
        text_prompt (str): The text prompt (used if input_mode is "Text").
        audio_prompt: The audio recording (used if input_mode is "Speech").
        style (str): The chosen style option.

    Returns:
        A tuple containing the generated image and a status message.
    """
    if input_mode == "Speech":
        try:
            prompt_text = convert_audio_to_text(audio_prompt)
        except Exception as e:
            return None, f"Error in speech-to-text conversion: {str(e)}"
    else:
        prompt_text = text_prompt

    return generate_image(prompt_text, style)


# ------------------------------------------------
# Gradio Interface
# ------------------------------------------------
interface = gr.Interface(
    fn=generate_image_from_input,
    inputs=[
        gr.Radio(
            label="Input Mode",
            choices=["Text", "Speech"],
            value="Text"
        ),
        gr.Textbox(
            label="Text Prompt",
            placeholder="Enter your image prompt here",
            value="A high-res, photorealistic image of a cat, sitting on a windowsill, looking outside."
        ),
        gr.Audio(
            type="filepath",
            label="Speak Your Prompt (if Speech mode selected)"
        ),
        gr.Radio(
            label="Style Option",
            choices=["Logo", "Wallpaper", "General", "Natural", "Drawing"],
            value="General"
        )
    ],
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Status Message")
    ],
    title="Customizable Image Generator with Voice Prompt Option",
    description="Choose your input mode (Text or Speech), enter or record your prompt, select a style, and generate an image."
)

interface.launch()