|
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool |
|
import datetime |
|
import pytz |
|
import yaml |
|
from tools.final_answer import FinalAnswerTool |
|
import gradio as gr |
|
from PIL import Image |
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
|
|
|
# Speech-recognition pipeline, built once at import time and shared by every
# call to convert_audio_to_text.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
)
|
|
|
def convert_audio_to_text(audio_file):
    """Convert an audio file to text using the ASR pipeline.

    Args:
        audio_file: The path or file-like object containing the recorded audio.

    Returns:
        The transcribed text, taken from the ``"text"`` entry of the
        pipeline result.

    Raises:
        ValueError: If ``audio_file`` is ``None``, i.e. there is nothing to
            transcribe.
    """
    # Reject a missing recording up front so the caller gets an actionable
    # message rather than whatever the pipeline raises when handed None.
    if audio_file is None:
        raise ValueError("No audio was provided for transcription.")

    result = asr_pipeline(audio_file)
    return result["text"]
|
|
|
|
|
|
|
|
|
|
|
@tool
def my_custom_tool(arg1: str, arg2: int) -> str:
    """A tool that does nothing yet.

    Args:
        arg1: the first argument used to trigger creative responses.
        arg2: the second argument, an integer value for any purpose.

    Returns:
        A string prompting creativity.
    """
    # NOTE(review): the @tool decorator presumably reads the docstring above
    # to describe the tool and its arguments - confirm before rewording it.
    # Placeholder behavior: both arguments are accepted but ignored, and the
    # same fixed string is returned every time.
    placeholder_reply = "What magic will you build?"
    return placeholder_reply
|
|
|
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """Fetches the current local time in a specified timezone.

    Args:
        timezone: a valid timezone string (e.g., 'America/New_York').

    Returns:
        The current time as a formatted string or an error message.
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
    except pytz.UnknownTimeZoneError:
        # UnknownTimeZoneError is a KeyError, so formatting the exception
        # would only repeat the quoted zone name; state the reason instead.
        return f"Error fetching time for timezone '{timezone}': unknown timezone name"
    except Exception as e:
        # Deliberately broad: this function reports failures as a string
        # result instead of raising, so any other error is described here.
        return f"Error fetching time for timezone '{timezone}': {e}"

    return f"The current local time in {timezone} is: {local_time}"
|
|
|
|
|
|
|
|
|
|
|
# Final-answer tool instance registered with the agent below.
final_answer = FinalAnswerTool()

# Model configuration handed to the agent.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    custom_role_conversions=None,
)

# Text-to-image tool; also called directly by generate_image.
# NOTE(review): trust_remote_code=True allows code from the
# "agents-course/text-to-image" repository to run locally - keep this only
# while that repository is trusted.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Read the prompt templates with an explicit encoding so the result does not
# depend on the platform's default text encoding.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)

# Tools made available to the agent.
tools = [
    final_answer,
    my_custom_tool,
    get_current_time_in_timezone,
    image_generation_tool,
]

# NOTE(review): this agent is constructed here but is not referenced by the
# Gradio interface in the visible part of this file - confirm it is needed.
agent = CodeAgent(
    model=model,
    tools=tools,
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="MyHuggingFaceAgent",
    description="An agent with multiple tools for code generation and utility tasks.",
    prompt_templates=prompt_templates,
)
|
|
|
|
|
|
|
|
|
|
|
# Prompt suffix appended for each recognised style. Any other style value
# (including "General") leaves the prompt unchanged.
_STYLE_SUFFIXES = {
    "Logo": "logo design, minimalistic, vector art",
    "Wallpaper": "wallpaper, high resolution, elegant, modern",
    "Natural": "natural style, realistic, vibrant colors",
    "Drawing": "drawing, sketch, pencil drawing style, artistic",
}


def generate_image(prompt: str, style: str):
    """Generate an image using the provided prompt and a selected style.

    Args:
        prompt (str): The user's image prompt.
        style (str): The selected style option, e.g., "Logo", "Wallpaper",
            "General", "Natural", or "Drawing".

    Returns:
        A tuple containing the path to the generated image and a status
        message. When the prompt is empty or blank, no image is generated
        and the tuple is ``(None, <message>)``.
    """
    import tempfile

    # Do not send an empty prompt to the image model; None would otherwise be
    # formatted into the literal text "None".
    if not prompt or not prompt.strip():
        return None, "Please provide a prompt to generate an image."

    suffix = _STYLE_SUFFIXES.get(style)
    modified_prompt = f"{prompt}, {suffix}" if suffix else prompt

    image = image_generation_tool(prompt=modified_prompt)
    if isinstance(image, str):
        # The tool already returned a file path.
        image_path = image
    else:
        # Write each result to its own file so that overlapping requests
        # cannot overwrite one another's image. The file is intentionally
        # kept (delete=False) because the caller needs the path afterwards.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            image_path = tmp.name
        image.save(image_path)

    status_message = f"Generated image with style: {style}"
    return image_path, status_message
|
|
|
|
|
|
|
|
|
|
|
def generate_image_from_input(input_mode: str, text_prompt: str, audio_prompt, style: str):
    """Generate an image using either a text prompt or a speech prompt.

    Args:
        input_mode (str): Either "Text" or "Speech". Any value other than
            "Speech" is handled as text input.
        text_prompt (str): The text prompt (used if input_mode is not "Speech").
        audio_prompt: The audio recording (used if input_mode is "Speech").
        style (str): The chosen style option.

    Returns:
        A tuple containing the generated image and a status message. If the
        speech prompt is missing or cannot be transcribed, the image is
        ``None`` and the status message describes the problem.
    """
    if input_mode != "Speech":
        return generate_image(text_prompt, style)

    # Speech mode without a recording: report it directly instead of letting
    # the transcription step fail with a library-level error.
    if audio_prompt is None:
        return None, "No audio was recorded. Please record a prompt or switch to Text mode."

    try:
        prompt_text = convert_audio_to_text(audio_prompt)
    except Exception as e:
        # Report transcription failures as a status message rather than
        # raising, matching the (image, status) return contract.
        return None, f"Error in speech-to-text conversion: {e}"

    return generate_image(prompt_text, style)
|
|
|
|
|
|
|
|
|
|
|
# Input widgets, created in the positional order of the parameters of
# generate_image_from_input: input mode, text prompt, audio prompt, style.
input_mode_radio = gr.Radio(
    choices=["Text", "Speech"],
    value="Text",
    label="Input Mode",
)
text_prompt_box = gr.Textbox(
    value="A high-res, photorealistic image of a cat, sitting on a windowsill, looking outside.",
    placeholder="Enter your image prompt here",
    label="Text Prompt",
)
audio_prompt_input = gr.Audio(
    label="Speak Your Prompt (if Speech mode selected)",
    type="filepath",
)
style_radio = gr.Radio(
    choices=["Logo", "Wallpaper", "General", "Natural", "Drawing"],
    value="General",
    label="Style Option",
)

# Output widgets, matching the (image, status message) tuple returned by
# generate_image_from_input.
generated_image_view = gr.Image(label="Generated Image")
status_message_box = gr.Textbox(label="Status Message")

interface = gr.Interface(
    fn=generate_image_from_input,
    inputs=[input_mode_radio, text_prompt_box, audio_prompt_input, style_radio],
    outputs=[generated_image_view, status_message_box],
    title="Customizable Image Generator with Voice Prompt Option",
    description="Choose your input mode (Text or Speech), enter or record your prompt, select a style, and generate an image.",
)

# Start the web UI as soon as this module is executed.
interface.launch()
|
|