TEXT_TO_IMAGE_MEHDINATHANI

Sleeping

App Files Files Community

TEXT_TO_IMAGE_MEHDINATHANI / app.py

mehdinathani

Update app.py

7410d3d verified 6 months ago

raw

history blame contribute delete

6.3 kB

	from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
	import datetime
	import pytz
	import yaml
	from tools.final_answer import FinalAnswerTool
	import gradio as gr
	from PIL import Image
	from transformers import pipeline

	# -------------------------
	# Set Up Speech Recognition Pipeline
	# -------------------------
	# Build the speech-recognition pipeline once at import time; the
	# "openai/whisper-base" checkpoint is then shared by every call to
	# convert_audio_to_text instead of being constructed per request.
	asr_pipeline = pipeline(
	    "automatic-speech-recognition",
	    model="openai/whisper-base",
	)

	def convert_audio_to_text(audio_file):
	    """Transcribe recorded speech with the module-level ASR pipeline.

	    Args:
	        audio_file: Audio input handed straight to ``asr_pipeline``. The
	            Gradio audio component in this file is configured with
	            ``type="filepath"``, so callers here pass a file path.

	    Returns:
	        The value stored under the ``"text"`` key of the pipeline result.
	    """
	    asr_result = asr_pipeline(audio_file)
	    return asr_result["text"]

	# -------------------------
	# Define Your Custom Tools
	# -------------------------

	@tool
	def my_custom_tool(arg1: str, arg2: int) -> str:
	    """A tool that does nothing yet.

	    Args:
	        arg1: the first argument used to trigger creative responses.
	        arg2: the second argument, an integer value for any purpose.

	    Returns:
	        A string prompting creativity.
	    """
	    # NOTE(review): the docstring wording is kept verbatim on purpose --
	    # presumably the @tool decorator derives the tool description from it;
	    # confirm against the smolagents docs before rewording.
	    # Placeholder implementation: both arguments are ignored.
	    placeholder_reply = "What magic will you build?"
	    return placeholder_reply

	@tool
	def get_current_time_in_timezone(timezone: str) -> str:
	    """Fetches the current local time in a specified timezone.

	    Args:
	        timezone: a valid timezone string (e.g., 'America/New_York').

	    Returns:
	        The current time as a formatted string or an error message.
	    """
	    # NOTE(review): docstring text is left untouched -- presumably @tool reads
	    # it to describe the tool; confirm before rewording.
	    try:
	        zone = pytz.timezone(timezone)
	        now_in_zone = datetime.datetime.now(zone)
	        local_time = now_in_zone.strftime("%Y-%m-%d %H:%M:%S")
	        return f"The current local time in {timezone} is: {local_time}"
	    except Exception as e:
	        # Failures are reported as text rather than raised, so the caller
	        # always receives a string.
	        return f"Error fetching time for timezone '{timezone}': {str(e)}"

	# -------------------------
	# Agent and Tool Setup
	# -------------------------

	final_answer = FinalAnswerTool()

	model = HfApiModel(
	    max_tokens=2096,
	    temperature=0.5,
	    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Use an alternative endpoint if needed
	    custom_role_conversions=None,
	)

	# Load the image generation tool from the Hub.
	# NOTE(review): trust_remote_code=True allows code from the Hub repository to
	# run locally -- keep the repository id pinned to a source you trust.
	image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

	# Decode the prompt templates explicitly as UTF-8 so parsing does not depend
	# on the locale's default encoding of the host running the app.
	with open("prompts.yaml", "r", encoding="utf-8") as stream:
	    prompt_templates = yaml.safe_load(stream)

	# Tools handed to the agent below.
	tools = [
	    final_answer,
	    my_custom_tool,
	    get_current_time_in_timezone,
	    image_generation_tool,
	]

	# NOTE(review): nothing in the visible part of this file calls `agent`; the
	# Gradio handlers invoke image_generation_tool directly.
	agent = CodeAgent(
	    model=model,
	    tools=tools,
	    max_steps=6,
	    verbosity_level=1,
	    grammar=None,
	    planning_interval=None,
	    name="MyHuggingFaceAgent",
	    description="An agent with multiple tools for code generation and utility tasks.",
	    prompt_templates=prompt_templates,
	)

	# ------------------------------------------------
	# Function to Generate Image Based on Style
	# ------------------------------------------------

	def generate_image(prompt: str, style: str):
	    """Render an image for ``prompt``, optionally decorated with style keywords.

	    Args:
	        prompt (str): The user's image prompt.
	        style (str): "Logo", "Wallpaper", "Natural" or "Drawing" appends that
	            style's keywords to the prompt; any other value (e.g. "General")
	            leaves the prompt unchanged.

	    Returns:
	        A tuple ``(image_path, status_message)``. When the image tool returns
	        a string it is used as the path as-is; otherwise the returned object
	        is saved to "output_image.png" and that path is returned.
	    """
	    # Keywords appended after the user's prompt for each known style.
	    style_keywords = {
	        "Logo": "logo design, minimalistic, vector art",
	        "Wallpaper": "wallpaper, high resolution, elegant, modern",
	        "Natural": "natural style, realistic, vibrant colors",
	        "Drawing": "drawing, sketch, pencil drawing style, artistic",
	    }
	    keywords = style_keywords.get(style)
	    if keywords is None:  # General (or any unrecognized style)
	        modified_prompt = prompt
	    else:
	        modified_prompt = f"{prompt}, {keywords}"

	    image = image_generation_tool(prompt=modified_prompt)
	    if isinstance(image, str):
	        image_path = image
	    else:
	        image_path = "output_image.png"
	        image.save(image_path)

	    return image_path, f"Generated image with style: {style}"

	# ------------------------------------------------
	# Combined Function for Text or Speech Input
	# ------------------------------------------------

	def generate_image_from_input(input_mode: str, text_prompt: str, audio_prompt, style: str):
	    """Generate an image from either a typed prompt or a spoken prompt.

	    Problems are reported through the status message (with ``None`` as the
	    image) instead of being raised, so the UI can display them.

	    Args:
	        input_mode (str): "Speech" to transcribe ``audio_prompt``; any other
	            value (e.g. "Text") uses ``text_prompt``.
	        text_prompt (str): The text prompt (used unless input_mode is "Speech").
	        audio_prompt: The audio recording (used if input_mode is "Speech");
	            ``None`` when nothing was recorded.
	        style (str): The chosen style option, forwarded to ``generate_image``.

	    Returns:
	        A tuple ``(image, status_message)``. ``image`` is ``None`` whenever the
	        prompt is missing/empty or a processing step failed.
	    """
	    if input_mode == "Speech":
	        # Without a recording there is nothing to transcribe; say so directly
	        # rather than surfacing an opaque pipeline error.
	        if audio_prompt is None:
	            return None, "No audio was recorded. Please record a prompt or switch to Text mode."
	        try:
	            prompt_text = convert_audio_to_text(audio_prompt)
	        except Exception as e:
	            return None, f"Error in speech-to-text conversion: {str(e)}"
	    else:
	        prompt_text = text_prompt

	    # Treat None and whitespace-only prompts as empty so a blank request is
	    # never sent to the image model.
	    prompt_text = (prompt_text or "").strip()
	    if not prompt_text:
	        return None, "The prompt is empty. Please enter or record a prompt."

	    # Mirror the speech branch: report generation failures via the status
	    # output instead of letting them escape to the UI as an unhandled error.
	    try:
	        return generate_image(prompt_text, style)
	    except Exception as e:
	        return None, f"Error in image generation: {str(e)}"

	# ------------------------------------------------
	# Gradio Interface
	# ------------------------------------------------

	# Input widgets, listed in the positional order of
	# generate_image_from_input(input_mode, text_prompt, audio_prompt, style).
	input_components = [
	    gr.Radio(
	        label="Input Mode",
	        choices=["Text", "Speech"],
	        value="Text",
	    ),
	    gr.Textbox(
	        label="Text Prompt",
	        placeholder="Enter your image prompt here",
	        value="A high-res, photorealistic image of a cat, sitting on a windowsill, looking outside.",
	    ),
	    gr.Audio(
	        type="filepath",
	        label="Speak Your Prompt (if Speech mode selected)",
	    ),
	    gr.Radio(
	        label="Style Option",
	        choices=["Logo", "Wallpaper", "General", "Natural", "Drawing"],
	        value="General",
	    ),
	]

	# Output widgets, matching the (image, status message) tuple the handler returns.
	output_components = [
	    gr.Image(label="Generated Image"),
	    gr.Textbox(label="Status Message"),
	]

	interface = gr.Interface(
	    fn=generate_image_from_input,
	    inputs=input_components,
	    outputs=output_components,
	    title="Customizable Image Generator with Voice Prompt Option",
	    description="Choose your input mode (Text or Speech), enter or record your prompt, select a style, and generate an image.",
	)

	# Start the web app as soon as the module is executed.
	interface.launch()