from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
import gradio as gr
from PIL import Image
from transformers import pipeline

# -------------------------
# Set Up Speech Recognition Pipeline
# -------------------------
# This uses a Whisper model for automatic speech recognition.
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base")


def convert_audio_to_text(audio_file):
    """
    Convert an audio file to text using the ASR pipeline.

    Args:
        audio_file: The path or file-like object containing the recorded audio.

    Returns:
        The transcribed text.
    """
    transcription = asr_pipeline(audio_file)["text"]
    return transcription


# -------------------------
# Define Your Custom Tools
# -------------------------
@tool
def my_custom_tool(arg1: str, arg2: int) -> str:
    """A tool that does nothing yet.

    Args:
        arg1: the first argument used to trigger creative responses.
        arg2: the second argument, an integer value for any purpose.

    Returns:
        A string prompting creativity.
    """
    return "What magic will you build?"


@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """Fetches the current local time in a specified timezone.

    Args:
        timezone: a valid timezone string (e.g., 'America/New_York').

    Returns:
        The current time as a formatted string or an error message.
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


# -------------------------
# Agent and Tool Setup
# -------------------------
final_answer = FinalAnswerTool()

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Use an alternative endpoint if needed
    custom_role_conversions=None,
)

# Load the image generation tool from the Hub.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

tools = [
    final_answer,
    my_custom_tool,
    get_current_time_in_timezone,
    image_generation_tool,
]

agent = CodeAgent(
    model=model,
    tools=tools,
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="MyHuggingFaceAgent",
    description="An agent with multiple tools for code generation and utility tasks.",
    prompt_templates=prompt_templates
)


# ------------------------------------------------
# Function to Generate Image Based on Style
# ------------------------------------------------
def generate_image(prompt: str, style: str):
    """
    Generate an image using the provided prompt and a selected style.

    Args:
        prompt (str): The user's image prompt.
        style (str): The selected style option, e.g., "Logo", "Wallpaper",
            "General", "Natural", or "Drawing".

    Returns:
        A tuple containing the path to the generated image and a status message.
    """
    if style == "Logo":
        modified_prompt = f"{prompt}, logo design, minimalistic, vector art"
    elif style == "Wallpaper":
        modified_prompt = f"{prompt}, wallpaper, high resolution, elegant, modern"
    elif style == "Natural":
        modified_prompt = f"{prompt}, natural style, realistic, vibrant colors"
    elif style == "Drawing":
        modified_prompt = f"{prompt}, drawing, sketch, pencil drawing style, artistic"
    else:  # General
        modified_prompt = prompt

    image = image_generation_tool(prompt=modified_prompt)

    if not isinstance(image, str):
        image_path = "output_image.png"
        image.save(image_path)
    else:
        image_path = image

    status_message = f"Generated image with style: {style}"
    return image_path, status_message


# ------------------------------------------------
# Combined Function for Text or Speech Input
# ------------------------------------------------
def generate_image_from_input(input_mode: str, text_prompt: str, audio_prompt, style: str):
    """
    Generate an image using either a text prompt or a speech prompt.

    Args:
        input_mode (str): Either "Text" or "Speech".
        text_prompt (str): The text prompt (used if input_mode is "Text").
        audio_prompt: The audio recording (used if input_mode is "Speech").
        style (str): The chosen style option.

    Returns:
        A tuple containing the generated image and a status message.
    """
    if input_mode == "Speech":
        try:
            prompt_text = convert_audio_to_text(audio_prompt)
        except Exception as e:
            return None, f"Error in speech-to-text conversion: {str(e)}"
    else:
        prompt_text = text_prompt

    return generate_image(prompt_text, style)


# ------------------------------------------------
# Gradio Interface
# ------------------------------------------------
interface = gr.Interface(
    fn=generate_image_from_input,
    inputs=[
        gr.Radio(
            label="Input Mode",
            choices=["Text", "Speech"],
            value="Text"
        ),
        gr.Textbox(
            label="Text Prompt",
            placeholder="Enter your image prompt here",
            value="A high-res, photorealistic image of a cat, sitting on a windowsill, looking outside."
        ),
        gr.Audio(
            type="filepath",
            label="Speak Your Prompt (if Speech mode selected)"
        ),
        gr.Radio(
            label="Style Option",
            choices=["Logo", "Wallpaper", "General", "Natural", "Drawing"],
            value="General"
        )
    ],
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Status Message")
    ],
    title="Customizable Image Generator with Voice Prompt Option",
    description="Choose your input mode (Text or Speech), enter or record your prompt, select a style, and generate an image."
)

interface.launch()