import os
from openai import OpenAI
import re  # Regex used to validate and parse conversation turns
from typing import Optional  # Optional marks nullable generation settings

# Ensure the OPENROUTER_API_KEY environment variable is set; never hard-code secrets in source.
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

# Point the OpenAI client to the OpenRouter API
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
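
# OpenRouter exposes an OpenAI-compatible Chat Completions API, so the standard
# OpenAI client works unchanged once base_url points at the OpenRouter endpoint.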

# --- Core Generation Functions ---

def generate_synthetic_text(
    prompt: str,
    model: str = "deepseek/deepseek-chat-v3-0324:free",
    system_message: str = "You are a helpful assistant generating synthetic data.",
    temperature: Optional[float] = 0.7, # Default temperature
    top_p: Optional[float] = None,       # Default top_p (let API decide if None)
    max_tokens: Optional[int] = None   # Default max_tokens (let API decide if None)
) -> str:
    """
    Generates synthetic text using an OpenRouter model via Chat Completions,
    including model parameter controls.

    Args:
        prompt: The user's input prompt.
        model: The model ID.
        system_message: The system message context.
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        The generated text string or an error message.
    """
    if not api_key:
        return "Error: OPENROUTER_API_KEY not configured. Please set the environment variable."

    # Prepare parameters, only including them if they are not None
    params = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        "extra_headers": {
             # "HTTP-Referer": "YOUR_SITE_URL",
             "X-Title": "SynthGen",
         }
    }
    if temperature is not None:
        params["temperature"] = temperature
    if top_p is not None:
        params["top_p"] = top_p
    if max_tokens is not None:
        params["max_tokens"] = max_tokens

    try:
        response = client.chat.completions.create(**params) # Use dictionary unpacking

        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            print(f"Warning: No content in response for model {model}. Response: {response}")
            return "Error: No content generated by the model."
    except Exception as e:
        print(f"Error during API call to model {model}: {e}")
        return f"Error during API call: {e}"

def generate_prompts(
    num_prompts: int,
    model: str,
    topic_hint: str = "diverse and interesting",
    temperature: Optional[float] = 0.7, # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 200 # Set a reasonable default max for prompts
) -> list[str]:
    """
    Generates a list of conversation prompts using an AI model.

    Args:
        num_prompts: The number of prompts to generate.
        model: The model ID to use for generation.
        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A list of generated prompts.
    """
    instruction = (
        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
        f"for generating synthetic conversations between a user and an AI assistant. "
        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
        f"Present each prompt on a new line, with no other introductory or concluding text."
        f"\n\nExamples:\n"
        f"- Act as a travel agent planning a trip to Japan.\n"
        f"- Explain the concept of black holes to a 5-year-old.\n"
        f"- Write a python function to reverse a string."
    )
    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."

    # Pass the settings down to generate_synthetic_text
    generated_text = generate_synthetic_text(
        instruction,
        model,
        system_message=system_msg,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    if generated_text.startswith("Error:"):
        raise ValueError(generated_text)

    # Split into lines and clean up any extra whitespace or empty lines
    prompts = [p.strip() for p in generated_text.strip().split('\n') if p.strip()]
    # Strip a leading "- " bullet if present (a bare replace would also delete
    # legitimate "- " sequences inside a prompt)
    prompts = [p[2:].lstrip() if p.startswith("- ") else p for p in prompts]
    if not prompts:
        # Log the raw generated text if parsing failed
        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
        raise ValueError("AI failed to generate prompts in the expected format.")

    # Truncate if the model generated more prompts than requested
    return prompts[:num_prompts]
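
# Example usage (illustrative; the topic hint and count are arbitrary):
#   prompts = generate_prompts(
#       5,
#       "deepseek/deepseek-chat-v3-0324:free",
#       topic_hint="related to home cooking",
#   )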


def generate_synthetic_conversation(
    system_prompt: str,
    model: str,
    num_turns: int,
    temperature: Optional[float] = 0.7, # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 1000 # Set a reasonable default max for conversations
) -> str:
    """
    Generates a synthetic conversation with a specified number of turns.

    Args:
        system_prompt: The initial system prompt defining the context or AI persona.
        model: The model ID to use for generation.
        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A string containing the formatted conversation.
    """
    # We'll ask the model to generate the whole conversation in one go for simplicity.
    # More complex approaches could involve iterative calls.
    instruction = (
        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
        f"Example Format:\n"
        f"User: Hello!\n"
        f"Assistant: Hi there! How can I help you today?\n"
        f"User: Can you explain photosynthesis?\n"
        f"Assistant: Certainly! Photosynthesis is the process..."
    )

    # Use the user-provided system prompt for the *conversation's* context,
    # but a generic one for the generation *task* itself.
    system_msg_for_generation = f"You are an AI assistant simulating a conversation. The context for the conversation you generate is: {system_prompt}"

    # Pass the settings down to generate_synthetic_text
    conversation_text = generate_synthetic_text(
        prompt=instruction,
        model=model,
        system_message=system_msg_for_generation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    if conversation_text.startswith("Error:"):
        # Propagate the error message
        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"

    # Basic validation/cleanup (optional)
    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
        print(f"Warning: Generated text for conversation '{system_prompt}' might not be in the expected format. Raw text:\n{conversation_text}")
        # Return the raw text anyway; the model's format may just differ slightly
        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"

    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"


# --- Main Execution (Example Usage) ---
if __name__ == "__main__":
    print("--- Testing Basic Text Generation ---")
    test_prompt = "Describe the benefits of using synthetic data."
    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100) # Example with settings
    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")

    print("\n--- Testing Prompt Generation ---")
    try:
        num_prompts_to_gen = 3
        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
        print(f"Generated {len(prompts_result)} prompts:")
        for i, p in enumerate(prompts_result):
            print(f"{i+1}. {p}")
    except ValueError as e:
        print(f"Error generating prompts: {e}")

    print("\n--- Testing Conversation Generation ---")
    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
    num_conv_turns = 3
    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
    print(f"{conv_result}\n")

    print("\n--- Testing with Invalid API Key (if applicable) ---")
    # Temporarily swap in an invalid key so the API call fails and the
    # error-handling path in generate_synthetic_text is exercised
    original_key = client.api_key
    client.api_key = "invalid-key"
    error_text_result = generate_synthetic_text("Test prompt")
    print(f"Result with invalid key: {error_text_result}")
    client.api_key = original_key # Restore original key

    print("\nGeneration tests complete.")