import gradio as gr
import json
import tempfile
import os
import re  # For parsing conversation text
from typing import Union, Optional, Dict, List, Tuple  # Union/List/Dict/Tuple keep Python < 3.10 compatibility

# Import the actual generation functions from synthgen
from synthgen import (
    generate_synthetic_text,
    generate_prompts,
    generate_synthetic_conversation
)
# The API key is handled inside synthgen, so app.py no longer imports or checks it directly.

# --- Helper Functions for JSON Generation ---

def create_json_file(data: object, base_filename: str) -> Union[str, None]:
    """Creates a temporary JSON file and returns its path."""
    try:
        # Create a temporary file with a .json extension
        with tempfile.NamedTemporaryFile(mode='w', suffix=".json", delete=False, encoding='utf-8') as temp_file:
            json.dump(data, temp_file, indent=4, ensure_ascii=False)
            return temp_file.name  # Return the path to the temporary file
    except Exception as e:
        print(f"Error creating JSON file {base_filename}: {e}")
        return None


def parse_conversation_string(text: str) -> List[Dict[str, str]]:
    """Parses a multi-line conversation string into a list of message dictionaries."""
    messages = []
    # Capture "User:" or "Assistant:" at the start of a line, followed by the message content
    pattern = re.compile(r"^(User|Assistant):\s*(.*)$", re.IGNORECASE | re.MULTILINE)
    for match in pattern.finditer(text):
        role = match.group(1).lower()
        content = match.group(2).strip()
        messages.append({"role": role, "content": content})
    # If the regex matched nothing but text exists, the transcript did not follow the
    # expected "User:/Assistant:" format; warn and return an empty list.
    if not messages and text:
        print(f"Warning: Could not parse conversation structure for: '{text[:100]}...'")
    return messages
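
# Illustrative example (not executed): assuming the model returns a transcript in the
# expected "User:/Assistant:" format, parse_conversation_string behaves like this:
#
#   parse_conversation_string("User: Hi there\nAssistant: Hello! How can I help?")
#   # -> [{"role": "user", "content": "Hi there"},
#   #     {"role": "assistant", "content": "Hello! How can I help?"}]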
""" if not system_prompts_text: return "Error: Please enter or generate at least one system prompt/topic." if num_turns <= 0: return "Error: Number of turns must be positive." prompts = [p.strip() for p in system_prompts_text.strip().split('\n') if p.strip()] if not prompts: return "Error: No valid prompts found in the input." output = f"Generating {len(prompts)} conversations ({num_turns} turns each) using model '{model}'...\n" output += "="*40 + "\n\n" for i, prompt in enumerate(prompts): # Call the actual function from synthgen.py # It handles API calls and returns the conversation or an error string. conversation_text = generate_synthetic_conversation(prompt, model, num_turns) # We don't need a try-except here because the function itself returns error strings # The title is now included within the returned string from the function output += f"--- Conversation {i+1}/{len(prompts)} ---\n" output += conversation_text + "\n\n" # Append result directly output += "="*40 + "\nGeneration complete (check results above for errors)." return output # Helper function for the Gradio UI to generate prompts using the real backend def generate_prompts_ui( num_prompts: int, model: str, temperature: float, # Add settings top_p: float, max_tokens: int ) -> str: """UI Wrapper to call the generate_prompts backend and format for Textbox.""" # Handle optional settings temp_val = temperature if temperature > 0 else None top_p_val = top_p if 0 < top_p <= 1 else None # Use a specific max_tokens for prompt generation or pass from UI? Let's pass from UI max_tokens_val = max_tokens if max_tokens > 0 else 200 # Set a default if UI value is 0 if not model: return "Error: Please select a model for prompt generation." if num_prompts <= 0: return "Error: Number of prompts to generate must be positive." if num_prompts > 50: return "Error: Cannot generate more than 50 prompts at a time." 
print(f"Generating prompts with settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val}") # Debug print try: # Call the actual function from synthgen.py, passing settings prompts_list = generate_prompts( num_prompts, model, temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val ) return "\n".join(prompts_list) except ValueError as e: # Catch errors raised by generate_prompts (e.g., API errors, parsing errors) return f"Error generating prompts: {e}" except Exception as e: # Catch any other unexpected errors print(f"Unexpected error in generate_prompts_ui: {e}") return f"An unexpected error occurred: {e}" # --- Modified Generation Wrappers --- # Wrapper for text generation + JSON preparation - RETURNS TUPLE def run_generation_and_prepare_json( prompt: str, model: str, num_samples: int, temperature: float, top_p: float, max_tokens: int ) -> Tuple[gr.update, gr.update]: # Return type hint (optional) """Generates text samples and prepares a JSON file for download.""" # Handle optional settings temp_val = temperature if temperature > 0 else None top_p_val = top_p if 0 < top_p <= 1 else None max_tokens_val = max_tokens if max_tokens > 0 else None # Handle errors by returning updates for both outputs in a tuple if not prompt: return (gr.update(value="Error: Please enter a prompt."), gr.update(value=None)) if num_samples <= 0: return (gr.update(value="Error: Number of samples must be positive."), gr.update(value=None)) output_str = f"Generating {num_samples} samples using model '{model}'...\n" output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n" output_str += "="*20 + "\n\n" results_list = [] for i in range(num_samples): generated_text = generate_synthetic_text( prompt, model, temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val ) output_str += f"--- Sample {i+1} ---\n" output_str += generated_text + "\n\n" if not generated_text.startswith("Error:"): results_list.append(generated_text) output_str += "="*20 + "\nGeneration complete (check results above for errors)." 

# --- Modified Generation Wrappers ---

# Wrapper for text generation + JSON preparation - returns a tuple of updates
def run_generation_and_prepare_json(
    prompt: str,
    model: str,
    num_samples: int,
    temperature: float,
    top_p: float,
    max_tokens: int
) -> Tuple[gr.update, gr.update]:  # Return type hint (optional)
    """Generates text samples and prepares a JSON file for download."""
    # Handle optional settings
    temp_val = temperature if temperature > 0 else None
    top_p_val = top_p if 0 < top_p <= 1 else None
    max_tokens_val = max_tokens if max_tokens > 0 else None

    # Handle errors by returning updates for both outputs in a tuple
    if not prompt:
        return (gr.update(value="Error: Please enter a prompt."), gr.update(value=None))
    if num_samples <= 0:
        return (gr.update(value="Error: Number of samples must be positive."), gr.update(value=None))

    output_str = f"Generating {num_samples} samples using model '{model}'...\n"
    output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n"
    output_str += "=" * 20 + "\n\n"

    results_list = []
    for i in range(num_samples):
        generated_text = generate_synthetic_text(
            prompt, model,
            temperature=temp_val,
            top_p=top_p_val,
            max_tokens=max_tokens_val
        )
        output_str += f"--- Sample {i+1} ---\n"
        output_str += generated_text + "\n\n"
        if not generated_text.startswith("Error:"):
            results_list.append(generated_text)

    output_str += "=" * 20 + "\nGeneration complete (check results above for errors)."

    json_filepath = create_json_file(results_list, "text_samples.json")

    # Return tuple of updates in the order of the outputs list
    return (gr.update(value=output_str), gr.update(value=json_filepath))
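
# Illustrative example (not executed): text_samples.json is simply the list of
# successfully generated samples (placeholder contents shown here), e.g.
#
#   [
#       "First generated sample...",
#       "Second generated sample..."
#   ]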

# Wrapper for conversation generation + JSON preparation - returns a tuple of updates
def run_conversation_generation_and_prepare_json(
    system_prompts_text: str,
    model: str,
    num_turns: int,
    temperature: float,
    top_p: float,
    max_tokens: int
) -> Tuple[gr.update, gr.update]:  # Return type hint (optional)
    """Generates conversations and prepares a JSON file for download."""
    # Handle optional settings
    temp_val = temperature if temperature > 0 else None
    top_p_val = top_p if 0 < top_p <= 1 else None
    max_tokens_val = max_tokens if max_tokens > 0 else None

    # Handle errors by returning updates for both outputs in a tuple
    if not system_prompts_text:
        return (gr.update(value="Error: Please enter or generate at least one system prompt/topic."), gr.update(value=None))
    if num_turns <= 0:
        return (gr.update(value="Error: Number of turns must be positive."), gr.update(value=None))

    prompts = [p.strip() for p in system_prompts_text.strip().split('\n') if p.strip()]
    if not prompts:
        return (gr.update(value="Error: No valid prompts found in the input."), gr.update(value=None))

    output_str = f"Generating {len(prompts)} conversations ({num_turns} turns each) using model '{model}'...\n"
    output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n"
    output_str += "=" * 40 + "\n\n"

    results_list_structured = []
    for i, prompt in enumerate(prompts):
        conversation_text = generate_synthetic_conversation(
            prompt, model, num_turns,
            temperature=temp_val,
            top_p=top_p_val,
            max_tokens=max_tokens_val
        )
        output_str += f"--- Conversation {i+1}/{len(prompts)} ---\n"
        output_str += conversation_text + "\n\n"

        # --- Parsing Logic ---
        # The returned string normally starts with a title line followed by a blank line;
        # strip that header so only the transcript is parsed into messages.
        core_conversation_text = conversation_text
        if conversation_text.startswith("Error:"):
            core_conversation_text = None
        elif "\n\n" in conversation_text:
            parts = conversation_text.split("\n\n", 1)
            core_conversation_text = parts[1] if len(parts) > 1 else conversation_text

        if core_conversation_text:
            messages = parse_conversation_string(core_conversation_text)
            if messages:
                results_list_structured.append({"prompt": prompt, "messages": messages})
            else:
                results_list_structured.append({
                    "prompt": prompt,
                    "error": "Failed to parse structure.",
                    "raw_text": core_conversation_text
                })
        elif conversation_text.startswith("Error:"):
            results_list_structured.append({"prompt": prompt, "error": conversation_text})
        else:
            results_list_structured.append({
                "prompt": prompt,
                "error": "Could not extract content.",
                "raw_text": conversation_text
            })
        # --- End Parsing Logic ---

    output_str += "=" * 40 + "\nGeneration complete (check results above for errors)."

    json_filepath = create_json_file(results_list_structured, "conversations.json")

    # Return tuple of updates in the order of the outputs list
    return (gr.update(value=output_str), gr.update(value=json_filepath))
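
# Illustrative example (not executed): each entry in conversations.json has the shape
#
#   {"prompt": "Act as a pirate discussing treasure maps.",
#    "messages": [{"role": "user", "content": "..."},
#                 {"role": "assistant", "content": "..."}]}
#
# or, when generation or parsing fails, an entry with "error" (and possibly "raw_text")
# instead of "messages".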

# --- Gradio Interface Definition ---
with gr.Blocks() as demo:
    gr.Markdown("# Synthetic Data Generator using OpenRouter")
    gr.Markdown("Generate synthetic text samples or conversations using various models.")
    # Removed the api_key_loaded check and warning Markdown

    # Define model choices (can be shared or specific per tab)
    # Consider fetching these dynamically from OpenRouter if possible in the future
    model_choices = [
        "deepseek/deepseek-chat-v3-0324:free",  # Example free model
        "meta-llama/llama-3.3-70b-instruct:free",
        "deepseek/deepseek-r1:free",
        "google/gemini-2.5-pro-exp-03-25:free",
        "qwen/qwen-2.5-72b-instruct:free",
        "featherless/qwerky-72b:free",
        "google/gemma-3-27b-it:free",
        "mistralai/mistral-small-24b-instruct-2501:free",
        "deepseek/deepseek-r1-distill-llama-70b:free",
        "sophosympatheia/rogue-rose-103b-v0.2:free",
        "nvidia/llama-3.1-nemotron-70b-instruct:free",
        "microsoft/phi-3-medium-128k-instruct:free",
        "undi95/toppy-m-7b:free",
        "huggingfaceh4/zephyr-7b-beta:free",
        "openrouter/quasar-alpha"
        # Add more model IDs as needed
    ]
    default_model = model_choices[0] if model_choices else None

    # --- Shared Model Settings ---
    # Use an Accordion for less clutter
    with gr.Accordion("Model Settings (Optional)", open=False):
        # Set reasonable ranges and defaults. Use 0 for Max Tokens/Top-P to signify 'None'/API default.
        temperature_slider = gr.Slider(
            minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature",
            info="Controls randomness. Higher values are more creative, lower are more deterministic. 0 means use API default."
        )
        top_p_slider = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Top-P (Nucleus Sampling)",
            info="Considers only tokens with cumulative probability mass >= top_p. 0 means use API default."
        )
        max_tokens_slider = gr.Number(
            value=0, minimum=0, maximum=8192, step=64, label="Max Tokens",
            info="Maximum number of tokens to generate in the completion. 0 means use API default."
        )

    with gr.Tabs():
        with gr.TabItem("Text Generation"):
            with gr.Row():
                prompt_input_text = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here (e.g., Generate a short product description for a sci-fi gadget)",
                    lines=3
                )
            with gr.Row():
                model_input_text = gr.Dropdown(
                    label="OpenRouter Model ID",
                    choices=model_choices,
                    value=default_model
                )
                num_samples_input_text = gr.Number(label="Number of Samples", value=3, minimum=1, maximum=20, step=1)
            generate_button_text = gr.Button("Generate Text Samples")
            output_text = gr.Textbox(label="Generated Samples", lines=15, show_copy_button=True)
            # Add File component for download
            download_file_text = gr.File(label="Download Samples as JSON")

            generate_button_text.click(
                fn=run_generation_and_prepare_json,
                inputs=[
                    prompt_input_text, model_input_text, num_samples_input_text,
                    temperature_slider, top_p_slider, max_tokens_slider  # Add settings inputs
                ],
                outputs=[output_text, download_file_text]
            )

        with gr.TabItem("Conversation Generation"):
            gr.Markdown("Enter one system prompt/topic per line below, or use the 'Generate Prompts' button.")
            with gr.Row():
                # Textbox for multiple prompts
                prompt_input_conv = gr.Textbox(
                    label="Prompts (one per line)",
                    lines=5,  # Make it multi-line
                    placeholder="Enter prompts here, one per line...\ne.g., Act as a pirate discussing treasure maps.\nDiscuss the future of space travel."
                )
            with gr.Row():
                # Input for number of prompts to generate
                num_prompts_input_conv = gr.Number(label="Number of Prompts to Generate", value=5, minimum=1, maximum=20, step=1)  # Keep max reasonable
                # Button to trigger AI prompt generation
                generate_prompts_button = gr.Button("Generate Prompts using AI")
            with gr.Row():
                # Model selection for conversation generation AND prompt generation
                model_input_conv = gr.Dropdown(
                    label="OpenRouter Model ID (for generation)",
                    choices=model_choices,
                    value=default_model
                )
            with gr.Row():
                # Input for number of turns per conversation
                num_turns_input_conv = gr.Number(label="Number of Turns per Conversation (approx)", value=5, minimum=1, maximum=20, step=1)  # Keep max reasonable
            # Button to generate the conversations based on the prompts in the Textbox
            generate_conversations_button = gr.Button("Generate Conversations")
            output_conv = gr.Textbox(label="Generated Conversations", lines=15, show_copy_button=True)
            # Add File component for download
            download_file_conv = gr.File(label="Download Conversations as JSON")

            # Connect the "Generate Prompts" button to the UI wrapper
            generate_prompts_button.click(
                fn=generate_prompts_ui,  # Use the wrapper that calls the real function
                inputs=[
                    num_prompts_input_conv, model_input_conv,
                    temperature_slider, top_p_slider, max_tokens_slider  # Add settings inputs
                ],
                outputs=prompt_input_conv
            )

            # Connect the "Generate Conversations" button to the real function wrapper
            generate_conversations_button.click(
                fn=run_conversation_generation_and_prepare_json,  # Use the wrapper that calls the real function
                inputs=[
                    prompt_input_conv, model_input_conv, num_turns_input_conv,
                    temperature_slider, top_p_slider, max_tokens_slider  # Add settings inputs
                ],
                outputs=[output_conv, download_file_conv]  # Output to both Textbox and File
            )

# Launch the Gradio app
if __name__ == "__main__":
    print("Launching Gradio App...")
    print("Make sure the OPENROUTER_API_KEY environment variable is set.")
    # Use share=True for a temporary public link if running locally and you need to test
    demo.launch()  # share=True
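
# Example usage (illustrative): run the app locally with the OpenRouter key exported
# in the environment, e.g.
#
#   export OPENROUTER_API_KEY=<your key>
#   python app.py
#
# The filename "app.py" is an assumption; use whatever this module is actually named.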