import os
import re  # Regex for parsing and validating conversation turns
from typing import Optional

from openai import OpenAI

# Read the API key from the environment; never hard-code secrets in source.
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

# Point the OpenAI client to the OpenRouter API
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
# --- Core Generation Functions ---

def generate_synthetic_text(
    prompt: str,
    model: str = "deepseek/deepseek-chat-v3-0324:free",
    system_message: str = "You are a helpful assistant generating synthetic data.",
    temperature: Optional[float] = 0.7,  # Default temperature
    top_p: Optional[float] = None,       # Let the API decide if None
    max_tokens: Optional[int] = None,    # Let the API decide if None
) -> str:
    """
    Generates synthetic text using an OpenRouter model via Chat Completions,
    including model parameter controls.

    Args:
        prompt: The user's input prompt.
        model: The model ID.
        system_message: The system message context.
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        The generated text string or an error message.
    """
    if not api_key:
        return "Error: OPENROUTER_API_KEY not configured properly. Please set the environment variable."

    # Prepare parameters, only including the optional ones when they are not None
    params = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        "extra_headers": {
            # "HTTP-Referer": "YOUR_SITE_URL",
            "X-Title": "SynthGen",
        },
    }
    if temperature is not None:
        params["temperature"] = temperature
    if top_p is not None:
        params["top_p"] = top_p
    if max_tokens is not None:
        params["max_tokens"] = max_tokens

    try:
        response = client.chat.completions.create(**params)  # Dictionary unpacking
        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            print(f"Warning: No content in response for model {model}. Response: {response}")
            return "Error: No content generated by the model."
    except Exception as e:
        print(f"Error during API call to model {model}: {e}")
        return f"Error during API call: {e}"
def generate_prompts(
    num_prompts: int,
    model: str,
    topic_hint: str = "diverse and interesting",
    temperature: Optional[float] = 0.7,  # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 200,     # Reasonable default cap for short prompts
) -> list[str]:
    """
    Generates a list of conversation prompts using an AI model.

    Args:
        num_prompts: The number of prompts to generate.
        model: The model ID to use for generation.
        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A list of generated prompts.
    """
    instruction = (
        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
        f"for generating synthetic conversations between a user and an AI assistant. "
        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
        f"Present each prompt on a new line, with no other introductory or concluding text."
        f"\n\nExamples:\n"
        f"- Act as a travel agent planning a trip to Japan.\n"
        f"- Explain the concept of black holes to a 5-year-old.\n"
        f"- Write a python function to reverse a string."
    )
    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."

    # Pass the settings down to generate_synthetic_text
    generated_text = generate_synthetic_text(
        instruction,
        model,
        system_message=system_msg,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if generated_text.startswith("Error:"):
        raise ValueError(generated_text)

    # Split into lines, drop empties, and strip only *leading* bullet or numbering
    # markers (a blanket replace of "- " would also mangle hyphens inside a prompt)
    prompts = [p.strip() for p in generated_text.strip().split("\n") if p.strip()]
    prompts = [re.sub(r"^(?:[-*]\s+|\d+[.)]\s+)", "", p) for p in prompts]
    if not prompts:
        # Log the raw generated text if parsing failed
        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
        raise ValueError("AI failed to generate prompts in the expected format.")

    # Truncate in case the model generated more than the requested number
    return prompts[:num_prompts]
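
# A small persistence helper so generated prompts can be reused across runs. This is an
# illustrative sketch; the JSONL format and default file name are assumptions, not part
# of the original app.
def save_prompts_jsonl(prompts: list[str], path: str = "prompts.jsonl") -> None:
    """Write one {"prompt": ...} JSON object per line."""
    import json

    with open(path, "w", encoding="utf-8") as f:
        for p in prompts:
            f.write(json.dumps({"prompt": p}, ensure_ascii=False) + "\n")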
def generate_synthetic_conversation(
    system_prompt: str,
    model: str,
    num_turns: int,
    temperature: Optional[float] = 0.7,  # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 1000,    # Reasonable default cap for conversations
) -> str:
    """
    Generates a synthetic conversation with a specified number of turns.

    Args:
        system_prompt: The initial system prompt defining the context or AI persona.
        model: The model ID to use for generation.
        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A string containing the formatted conversation.
    """
    # Ask the model to generate the whole conversation in one call for simplicity;
    # more complex approaches could build it up with iterative calls.
    instruction = (
        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
        f"Example Format:\n"
        f"User: Hello!\n"
        f"Assistant: Hi there! How can I help you today?\n"
        f"User: Can you explain photosynthesis?\n"
        f"Assistant: Certainly! Photosynthesis is the process..."
    )
    # Use the user-provided system prompt for the *conversation's* context,
    # but a generic one for the generation *task* itself.
    system_msg_for_generation = (
        f"You are an AI assistant simulating a conversation. "
        f"The context for the conversation you generate is: {system_prompt}"
    )

    # Pass the settings down to generate_synthetic_text
    conversation_text = generate_synthetic_text(
        prompt=instruction,
        model=model,
        system_message=system_msg_for_generation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if conversation_text.startswith("Error:"):
        # Propagate the error message
        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"

    # Basic format validation: warn, but still return the raw text, since the
    # model's format may just differ slightly from the requested one
    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
        print(f"Warning: Generated text for conversation '{system_prompt}' might not be in the expected format. Raw text:\n{conversation_text}")
        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"

    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
# --- Main Execution (Example Usage) ---
if __name__ == "__main__":
    print("--- Testing Basic Text Generation ---")
    test_prompt = "Describe the benefits of using synthetic data."
    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100)  # Example with settings
    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")

    print("\n--- Testing Prompt Generation ---")
    try:
        num_prompts_to_gen = 3
        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
        print(f"Generated {len(prompts_result)} prompts:")
        for i, p in enumerate(prompts_result):
            print(f"{i + 1}. {p}")
    except ValueError as e:
        print(f"Error generating prompts: {e}")

    print("\n--- Testing Conversation Generation ---")
    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
    num_conv_turns = 3
    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
    print(f"{conv_result}\n")

    print("\n--- Testing with Invalid API Key (if applicable) ---")
    # Temporarily swap in an invalid key to exercise the error-handling path
    original_key = client.api_key
    client.api_key = "invalid-key"
    error_text_result = generate_synthetic_text("Test prompt")
    print(f"Result with invalid key: {error_text_result}")
    client.api_key = original_key  # Restore the original key

    print("\nGeneration tests complete.")