"""Gradio front end for a hosted text-to-speech HTTP endpoint."""

import base64
import io
import json
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import numpy as np
import requests
from gradio import Text
from pydub import AudioSegment

# Define the API endpoint URL
url = "https://ruslanmv-hf-llm-api-collection.hf.space/tts"

# Set headers for content type and desired response format
headers = {
    "Content-Type": "application/json",
    "accept": "application/json",  # May need adjustment if endpoint doesn't support JSON
}

# Upper bound (seconds) on one TTS request. Without a timeout, requests waits
# forever on a stalled endpoint and the Gradio worker never returns.
REQUEST_TIMEOUT_SECONDS = 120

# Container formats tried, in order, when decoding the returned audio.
_CANDIDATE_FORMATS = ("wav", "mp3")


def convert_text_to_base64(text: str, language: str = "en") -> Optional[str]:
    """Convert text to a base64-encoded audio string using the TTS API.

    Args:
        text: The text to convert to speech.
        language: The language code for the speech (default: "en").

    Returns:
        The base64-encoded audio string on success, None on error. Errors are
        reported on stdout rather than raised.
    """
    payload = {"input_text": text, "from_language": language}

    try:
        response = requests.post(
            url,
            headers=headers,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except (requests.RequestException, TypeError, ValueError) as e:
        # RequestException covers network failures and timeouts; TypeError /
        # ValueError cover payloads that cannot be JSON-encoded (older
        # requests releases raise these directly).
        print(f"Error: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        # Check for a JSON response first.
        response_data = response.json()
    except ValueError:
        # Every JSON decode error raised by requests (stdlib json, simplejson,
        # requests' own JSONDecodeError) derives from ValueError.
        # If not JSON, assume raw binary audio.
        audio_data: Any = response.content
    else:
        if not isinstance(response_data, dict):
            print("Error: Missing audio data in response.")
            return None
        if "detail" in response_data:
            print(f"Error: {response_data['detail']}")
            return None
        audio_data = response_data.get("audio")

    if not audio_data:
        print("Error: Missing audio data in response.")
        return None

    if isinstance(audio_data, str):
        # JSON cannot carry raw bytes, so a string payload is presumably
        # already base64 -- TODO confirm against the endpoint's schema.
        # Validate it instead of trusting it blindly.
        try:
            audio_data = base64.b64decode(audio_data, validate=True)
        except ValueError as e:  # binascii.Error is a ValueError subclass
            print(f"Error: {e}")
            return None

    if not isinstance(audio_data, (bytes, bytearray)):
        print("Error: Missing audio data in response.")
        return None

    return base64.b64encode(audio_data).decode("ascii")


def get_audio_properties(audio_data: bytes) -> Dict[str, Any]:
    """Decode audio bytes and describe the resulting segment.

    The data is decoded as WAV first and, if that fails, as MP3.

    Args:
        audio_data: Encoded audio bytes.

    Returns:
        A dict with the keys "format", "duration" (seconds), "bitrate",
        "channels", "sample_width" and "audio_segment" (the decoded
        ``AudioSegment``). Note that "bitrate" holds the segment's
        ``frame_rate``; the key name is kept for backward compatibility.

    Raises:
        ValueError: If the data cannot be decoded as WAV or MP3.
    """
    last_error: Optional[Exception] = None
    for candidate in _CANDIDATE_FORMATS:
        try:
            audio_segment = AudioSegment.from_file(
                io.BytesIO(audio_data), format=candidate
            )
        except Exception as e:
            # Not a bare ``except``: KeyboardInterrupt / SystemExit must
            # still propagate. Any decoder failure means "try the next one".
            last_error = e
            continue

        return {
            "format": candidate,
            "duration": len(audio_segment) / 1000.0,  # len() is in milliseconds
            "bitrate": audio_segment.frame_rate,
            "channels": audio_segment.channels,
            "sample_width": audio_segment.sample_width,
            "audio_segment": audio_segment,
        }

    raise ValueError(f"Unknown audio format: {last_error}") from last_error


def play_audio(text: str) -> Tuple[int, np.ndarray]:
    """Convert text to speech via the API and return it for ``gr.Audio``.

    Args:
        text: The text to synthesize.

    Returns:
        A ``(sample_rate, samples)`` tuple. ``samples`` is 1-D for mono audio
        and has shape ``(n_frames, channels)`` otherwise.

    Raises:
        gr.Error: If the conversion fails or the returned audio cannot be
            decoded. Gradio shows the message to the user.
    """
    base64_encoded_audio = convert_text_to_base64(text)
    if not base64_encoded_audio:
        # Returning a plain string to an Audio output would be treated as a
        # file path; raising gr.Error surfaces the message in the UI instead.
        raise gr.Error("Error occurred during conversion.")

    audio_data = base64.b64decode(base64_encoded_audio)

    try:
        properties = get_audio_properties(audio_data)
    except ValueError as e:
        raise gr.Error(str(e)) from e
    print("Audio Properties:", properties)

    audio_segment = properties["audio_segment"]
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        # The flat sample array interleaves channels; give each its own column.
        samples = samples.reshape((-1, audio_segment.channels))

    # Report the rate the audio was actually decoded at. A hard-coded value
    # plays back at the wrong speed whenever the endpoint uses another rate.
    return audio_segment.frame_rate, samples


# Define the Gradio interface with clear labels for user interaction
interface = gr.Interface(
    fn=play_audio,
    title="Text to Speech API",  # Add a title to the interface
    description="Developed by Ruslan Magana, visit ruslanmv.com for more information.",
    inputs=Text(label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio", type="numpy"),
)

# Launch the Gradio interface
interface.launch()