biswanath2.roul committed
Commit e54fd17 · 0 Parent(s)

Initial commit
.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore ADDED
@@ -0,0 +1,60 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+ pl200525/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # Prompt storage (for local development)
+ promptlab_storage/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Biswanath Roul
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,167 @@
+ # PromptLab: LLM Prompt Management System
+
+ PromptLab is a comprehensive library for managing, versioning, testing, and evaluating prompts for Large Language Models (LLMs). It provides a structured framework to help data scientists and developers create, optimize, and maintain high-quality prompts.
+
+ ## Features
+
+ - **Prompt Management**: Create, update, and organize prompts with metadata and tags
+ - **Version Control**: Track prompt changes over time with full version history
+ - **A/B Testing**: Compare different prompt variations to find the most effective one
+ - **Evaluation Framework**: Measure prompt quality with customizable metrics
+ - **Advanced Templating**: Create dynamic prompts with variables, conditionals, and loops
+ - **Command-line Interface**: Easily integrate into your workflow
+
+ ## Documentation
+
+ For detailed documentation, see the [docs](./docs) directory:
+
+ - [Getting Started](./docs/getting_started.md)
+ - [API Reference](./docs/api_reference.md)
+ - [CLI Usage](./docs/cli_usage.md)
+ - [Advanced Features](./docs/advanced_features.md)
+ - [Integration Examples](./docs/integration_examples.md)
+
+ ## Installation
+
+ ```bash
+ pip install promptlab
+ ```
+
+ ## Quick Start
+
+ ```python
+ from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator
+
+ # Initialize components
+ prompt_manager = PromptManager()
+ version_control = VersionControl(prompt_manager)
+ testing = PromptTesting(prompt_manager)
+ evaluator = Evaluator(prompt_manager)
+
+ # Create a prompt
+ prompt = prompt_manager.create(
+     content="Summarize the following text: {text}",
+     name="Simple Summarization",
+     description="A simple prompt for text summarization",
+     tags=["summarization", "basic"]
+ )
+
+ # Create a new version
+ version_control.commit(
+     prompt_id=prompt.id,
+     commit_message="Initial version"
+ )
+
+ # Update the prompt
+ prompt_manager.update(
+     prompt.id,
+     content="Please provide a concise summary of the following text in 2-3 sentences: {text}"
+ )
+
+ # Commit the updated version
+ version_control.commit(
+     prompt_id=prompt.id,
+     commit_message="Improved prompt with length guidance"
+ )
+
+ # Create a test case
+ test_case = testing.create_test_case(
+     prompt_id=prompt.id,
+     input_vars={"text": "Lorem ipsum dolor sit amet..."},
+     expected_output="This is a summary of the text."
+ )
+
+ # Define an LLM callback for testing
+ async def llm_callback(prompt, vars):
+     # In a real scenario, this would call an actual LLM API
+     return "This is a summary of the text."
+
+ # Run the test case
+ import asyncio
+ test_result = asyncio.run(testing.run_test_case(
+     test_case_id=test_case.id,
+     llm_callback=llm_callback
+ ))
+
+ # Evaluate a prompt with multiple inputs
+ evaluation_result = asyncio.run(evaluator.evaluate_prompt(
+     prompt_id=prompt.id,
+     inputs=[{"text": "Sample text 1"}, {"text": "Sample text 2"}],
+     llm_callback=llm_callback
+ ))
+
+ print(f"Evaluation metrics: {evaluation_result['aggregated_metrics']}")
+ ```
+
+ ## Command-line Interface
+
+ PromptLab comes with a powerful CLI for managing prompts:
+
+ ```bash
+ # Create a prompt
+ promptlab prompt create "Summarization" --content "Summarize: {text}" --tags "summarization,basic"
+
+ # List all prompts
+ promptlab prompt list
+
+ # Create a new version
+ promptlab version commit <prompt_id> --message "Updated prompt"
+
+ # Run tests
+ promptlab test run-all <prompt_id> --llm openai
+ ```
+
+ ## Advanced Usage
+
+ ### Advanced Templating
+
+ PromptLab supports advanced templating with conditionals and loops:
+
+ ```python
+ from promptlab import PromptTemplate
+
+ template = PromptTemplate("""
+ {system_message}
+
+ {for example in examples}
+ Input: {example.input}
+ Output: {example.output}
+ {endfor}
+
+ Input: {input}
+ Output:
+ """)
+
+ rendered = template.render(
+     system_message="You are a helpful assistant.",
+     examples=[
+         {"input": "Hello", "output": "Hi there!"},
+         {"input": "How are you?", "output": "I'm doing well, thanks!"}
+     ],
+     input="What's the weather like?"
+ )
+ ```
+
+ ### Custom Evaluation Metrics
+
+ Create custom metrics to evaluate prompt performance:
+
+ ```python
+ from promptlab import EvaluationMetric, Evaluator
+
+ class CustomMetric(EvaluationMetric):
+     def __init__(self):
+         super().__init__("custom_metric", "My custom evaluation metric")
+
+     def compute(self, generated_output, expected_output=None, **kwargs):
+         # Custom logic to score the output; must return a float between 0 and 1
+         score = 1.0 if generated_output else 0.0
+         return score
+
+ # Register the custom metric
+ evaluator = Evaluator(prompt_manager)
+ evaluator.register_metric(CustomMetric())
+ ```
+
+ ## Use Cases
+
+ - **Prompt Development**: Iteratively develop and refine prompts with version control
+ - **Prompt Optimization**: A/B test different prompt variations to find the most effective approach
+ - **Quality Assurance**: Ensure prompt quality with automated testing and evaluation
+ - **Team Collaboration**: Share and collaborate on prompts with a centralized management system
+ - **Production Deployment**: Maintain consistent prompt quality in production applications
+
+ ## License
+
+ MIT License
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## Author
+
+ Biswanath Roul - [GitHub](https://github.com/biswanathroul)
+
docs/README.md ADDED
@@ -0,0 +1,11 @@
+ # PromptLab Documentation
+
+ This directory contains detailed documentation for the PromptLab library.
+
+ ## Contents
+
+ - [Getting Started](./getting_started.md)
+ - [API Reference](./api_reference.md)
+ - [CLI Usage](./cli_usage.md)
+ - [Advanced Features](./advanced_features.md)
+ - [Integration Examples](./integration_examples.md)
docs/advanced_features.md ADDED
@@ -0,0 +1,268 @@
+ # Advanced Features
+
+ PromptLab provides several advanced features for sophisticated prompt engineering.
+
+ ## Advanced Templating
+
+ PromptLab's templating system goes beyond simple variable substitution, offering conditionals and loops.
+
+ ### Basic Variable Substitution
+
+ ```python
+ from promptlab import PromptTemplate
+
+ # Simple variable substitution
+ template = PromptTemplate("Hello, {name}!")
+ rendered = template.render(name="John")
+ # Result: "Hello, John!"
+ ```
+
+ ### Conditional Logic
+
+ ```python
+ # Conditionals
+ template = PromptTemplate("""
+ {if is_formal}
+ Dear {name},
+
+ I hope this message finds you well.
+ {else}
+ Hey {name}!
+ {endif}
+
+ {message}
+ """)
+
+ formal = template.render(is_formal=True, name="Dr. Smith", message="Please review the attached document.")
+ casual = template.render(is_formal=False, name="Bob", message="Want to grab lunch?")
+ ```
+
+ ### Loops
+
+ ```python
+ # Loops
+ template = PromptTemplate("""
+ Here are your tasks:
+
+ {for task in tasks}
+ - {task.priority}: {task.description}
+ {endfor}
+ """)
+
+ rendered = template.render(tasks=[
+     {"priority": "High", "description": "Complete the report"},
+     {"priority": "Medium", "description": "Schedule meeting"},
+     {"priority": "Low", "description": "Organize files"}
+ ])
+ ```
+
+ ### Nested Structures
+
+ ```python
+ # Combining loops and conditionals
+ template = PromptTemplate("""
+ {system_message}
+
+ {for example in examples}
+ User: {example.input}
+ {if example.has_reasoning}
+ Reasoning: {example.reasoning}
+ {endif}
+ Assistant: {example.output}
+ {endfor}
+
+ User: {query}
+ Assistant:
+ """)
+
+ rendered = template.render(
+     system_message="You are a helpful assistant.",
+     examples=[
+         {
+             "input": "What's 2+2?",
+             "has_reasoning": True,
+             "reasoning": "Adding 2 and 2 gives 4",
+             "output": "4"
+         },
+         {
+             "input": "Hello",
+             "has_reasoning": False,
+             "output": "Hi there! How can I help you today?"
+         }
+     ],
+     query="What's the capital of France?"
+ )
+ ```
+
+ ## Custom Evaluation Metrics
+
+ You can create custom metrics to evaluate prompt outputs based on your specific requirements.
+
+ ### Creating a Custom Metric
+
+ ```python
+ from promptlab import EvaluationMetric
+
+ class RelevanceMetric(EvaluationMetric):
+     """Evaluates relevance of output to a given topic."""
+
+     def __init__(self, topics):
+         super().__init__("relevance", "Evaluates relevance to specified topics")
+         self.topics = topics
+
+     def compute(self, generated_output, expected_output=None, **kwargs):
+         """
+         Compute relevance score based on topic presence.
+         Returns a float between 0 and 1.
+         """
+         score = 0
+         output_lower = generated_output.lower()
+
+         for topic in self.topics:
+             if topic.lower() in output_lower:
+                 score += 1
+
+         # Normalize to 0-1 range
+         return min(1.0, score / len(self.topics)) if self.topics else 0.0
+ ```
+
+ ### Using Custom Metrics
+
+ ```python
+ from promptlab import Evaluator, PromptManager
+
+ # Initialize components
+ prompt_manager = PromptManager()
+ evaluator = Evaluator(prompt_manager)
+
+ # Register custom metric
+ climate_relevance = RelevanceMetric(["climate", "temperature", "warming", "environment"])
+ evaluator.register_metric(climate_relevance)
+
+ # Use in evaluation
+ async def my_llm(prompt, vars):
+     # Call your LLM API here
+     return "Climate change is causing global temperature increases..."
+
+ results = await evaluator.evaluate_prompt(
+     prompt_id="abc123",
+     inputs=[{"topic": "climate change"}],
+     llm_callback=my_llm,
+     metric_names=["relevance"]  # Use our custom metric
+ )
+
+ print(f"Relevance score: {results['aggregated_metrics']['relevance']}")
+ ```
+
+ ## Customizing Storage
+
+ PromptLab allows you to customize where and how prompts and related data are stored.
+
+ ### Custom Storage Locations
+
+ ```python
+ # Specify a custom storage location
+ prompt_manager = PromptManager("/path/to/my/prompts")
+
+ # Export/import prompts
+ import json
+
+ # Export a prompt to a file
+ prompt = prompt_manager.get("abc123")
+ with open("exported_prompt.json", "w") as f:
+     json.dump(prompt.to_dict(), f, indent=2)
+
+ # Import a prompt from a file
+ with open("exported_prompt.json", "r") as f:
+     data = json.load(f)
+ imported_prompt = prompt_manager.import_prompt(data)
+ ```
+
+ ## LLM Integration
+
+ PromptLab is designed to work with any LLM through callback functions. Here are examples of integrating with popular LLM APIs.
+
+ ### OpenAI Integration
+
+ ```python
+ import openai
+ from promptlab import PromptManager, PromptTesting
+
+ prompt_manager = PromptManager()
+ testing = PromptTesting(prompt_manager)
+
+ # Configure OpenAI
+ openai.api_key = "your-api-key"
+
+ # OpenAI callback function
+ async def openai_callback(prompt, vars):
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         messages=[{"role": "user", "content": prompt}],
+         temperature=0.7,
+         max_tokens=150
+     )
+     return response.choices[0].message.content
+
+ # Run tests with OpenAI
+ test_results = await testing.run_all_tests("abc123", openai_callback)
+ ```
+
+ ### Anthropic Integration
+
+ ```python
+ import anthropic
+ from promptlab import PromptManager, Evaluator
+
+ prompt_manager = PromptManager()
+ evaluator = Evaluator(prompt_manager)
+
+ # Configure Anthropic
+ client = anthropic.Anthropic(api_key="your-api-key")
+
+ # Anthropic callback function
+ async def anthropic_callback(prompt, vars):
+     response = client.messages.create(
+         model="claude-2",
+         messages=[{"role": "user", "content": prompt}],
+         max_tokens=150
+     )
+     return response.content[0].text
+
+ # Evaluate with Anthropic
+ eval_results = await evaluator.evaluate_prompt(
+     prompt_id="abc123",
+     inputs=[{"query": "What is machine learning?"}],
+     llm_callback=anthropic_callback
+ )
+ ```
+
+ ### Hugging Face Integration
+
+ ```python
+ from transformers import pipeline
+ import asyncio
+ from promptlab import PromptManager, VersionControl
+
+ prompt_manager = PromptManager()
+ version_control = VersionControl(prompt_manager)
+
+ # Set up Hugging Face pipeline
+ generator = pipeline('text-generation', model='gpt2')
+
+ # Hugging Face callback function
+ async def hf_callback(prompt, vars):
+     # Run synchronously but in a way that doesn't block the asyncio event loop
+     loop = asyncio.get_event_loop()
+     result = await loop.run_in_executor(None, lambda: generator(prompt, max_length=100)[0]['generated_text'])
+     return result
+
+ # Use with version control
+ prompt = prompt_manager.create(
+     content="Complete this: {text}",
+     name="Text Completion"
+ )
+ version_control.commit(prompt.id, "Initial version")
+
+ # Test with different models by swapping the callback
+ ```
docs/api_reference.md ADDED
@@ -0,0 +1,247 @@
+ # API Reference
+
+ This document provides detailed API documentation for the main components of PromptLab.
+
+ ## PromptManager
+
+ The `PromptManager` class is the core component for managing prompts.
+
+ ```python
+ from promptlab import PromptManager
+ ```
+
+ ### Methods
+
+ #### `__init__(storage_path=None)`
+ - **Description**: Initialize a new PromptManager.
+ - **Parameters**:
+   - `storage_path` (str, optional): Path to store prompts. Defaults to "~/promptlab_storage".
+
+ #### `create(content, name, description='', tags=None, metadata=None)`
+ - **Description**: Create a new prompt.
+ - **Parameters**:
+   - `content` (str): The prompt text with optional variables in {variable_name} format.
+   - `name` (str): Name of the prompt.
+   - `description` (str, optional): Description of the prompt.
+   - `tags` (list of str, optional): Tags for categorization.
+   - `metadata` (dict, optional): Additional metadata.
+ - **Returns**: `Prompt` object.
+
+ #### `get(prompt_id)`
+ - **Description**: Get a prompt by ID.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+ - **Returns**: `Prompt` object or None if not found.
+
+ #### `update(prompt_id, content=None, name=None, description=None, tags=None, metadata=None)`
+ - **Description**: Update a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt to update.
+   - `content` (str, optional): New prompt text.
+   - `name` (str, optional): New name.
+   - `description` (str, optional): New description.
+   - `tags` (list of str, optional): New tags.
+   - `metadata` (dict, optional): New metadata.
+ - **Returns**: Updated `Prompt` object.
+
+ #### `delete(prompt_id)`
+ - **Description**: Delete a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt to delete.
+ - **Returns**: True if deleted, False otherwise.
+
+ #### `list_all()`
+ - **Description**: List all prompts.
+ - **Returns**: List of `Prompt` objects.
+
+ #### `search_by_tags(tags, match_all=False)`
+ - **Description**: Search prompts by tags.
+ - **Parameters**:
+   - `tags` (list of str): Tags to search for.
+   - `match_all` (bool, optional): If True, prompt must have all tags.
+ - **Returns**: List of matching `Prompt` objects.
+
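A minimal usage sketch for the methods documented above; the prompt content, name, and tags are illustrative values, not part of the reference:

```python
from promptlab import PromptManager

manager = PromptManager()  # defaults to ~/promptlab_storage

# Create, fetch, update, search, and delete a prompt
prompt = manager.create(
    content="Summarize the following text: {text}",
    name="Summarizer",
    tags=["summarization"],
)
fetched = manager.get(prompt.id)
manager.update(prompt.id, description="Produces short summaries")
matches = manager.search_by_tags(["summarization"])
all_prompts = manager.list_all()
manager.delete(prompt.id)
```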
+ ## VersionControl
+
+ The `VersionControl` class manages prompt versions.
+
+ ```python
+ from promptlab import VersionControl
+ ```
+
+ ### Methods
+
+ #### `__init__(prompt_manager)`
+ - **Description**: Initialize the version control system.
+ - **Parameters**:
+   - `prompt_manager` (PromptManager): A PromptManager instance.
+
+ #### `commit(prompt_id, commit_message, metadata=None)`
+ - **Description**: Create a new version of a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `commit_message` (str): Message describing the changes.
+   - `metadata` (dict, optional): Additional version metadata.
+ - **Returns**: Version number (int).
+
+ #### `list_versions(prompt_id)`
+ - **Description**: List all versions of a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+ - **Returns**: List of version objects.
+
+ #### `get_version(prompt_id, version_number)`
+ - **Description**: Get a specific version of a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `version_number` (int): The version number.
+ - **Returns**: Version data.
+
+ #### `checkout(prompt_id, version_number)`
+ - **Description**: Revert a prompt to a specific version.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `version_number` (int): The version to revert to.
+ - **Returns**: Updated `Prompt` object.
+
+ #### `diff(prompt_id, version1, version2)`
+ - **Description**: Compare two versions of a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `version1` (int): First version number.
+   - `version2` (int): Second version number.
+ - **Returns**: Diff object.
+
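A short sketch of a commit/inspect/revert cycle using the methods above (the prompt content and commit messages are illustrative):

```python
from promptlab import PromptManager, VersionControl

manager = PromptManager()
vc = VersionControl(manager)

prompt = manager.create(content="Summarize: {text}", name="Summarizer")
v1 = vc.commit(prompt.id, "Initial version")

manager.update(prompt.id, content="Summarize in one sentence: {text}")
v2 = vc.commit(prompt.id, "Tighter length constraint")

print(vc.list_versions(prompt.id))   # all versions of the prompt
print(vc.diff(prompt.id, v1, v2))    # compare the two versions
vc.checkout(prompt.id, v1)           # revert to the first version
```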
+ ## PromptTesting
+
+ The `PromptTesting` class provides testing capabilities.
+
+ ```python
+ from promptlab import PromptTesting
+ ```
+
+ ### Methods
+
+ #### `__init__(prompt_manager)`
+ - **Description**: Initialize the testing system.
+ - **Parameters**:
+   - `prompt_manager` (PromptManager): A PromptManager instance.
+
+ #### `create_test_case(prompt_id, input_vars, expected_output=None, name=None, description=None)`
+ - **Description**: Create a test case for a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt to test.
+   - `input_vars` (dict): Variables to substitute in the prompt.
+   - `expected_output` (str, optional): Expected response.
+   - `name` (str, optional): Test case name.
+   - `description` (str, optional): Test case description.
+ - **Returns**: Test case object.
+
+ #### `run_test_case(test_case_id, llm_callback)`
+ - **Description**: Run a test case.
+ - **Parameters**:
+   - `test_case_id` (str): The ID of the test case.
+   - `llm_callback` (callable): Function to call LLM.
+ - **Returns**: Test result.
+
+ #### `run_all_tests(prompt_id, llm_callback)`
+ - **Description**: Run all tests for a prompt.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `llm_callback` (callable): Function to call LLM.
+ - **Returns**: List of test results.
+
+ #### `ab_test(prompt_id_a, prompt_id_b, test_cases, llm_callback, metrics=None)`
+ - **Description**: Run A/B tests comparing two prompts.
+ - **Parameters**:
+   - `prompt_id_a` (str): First prompt ID.
+   - `prompt_id_b` (str): Second prompt ID.
+   - `test_cases` (list): Test cases to run.
+   - `llm_callback` (callable): Function to call LLM.
+   - `metrics` (list, optional): Metrics to compare.
+ - **Returns**: A/B test results.
+
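A sketch of creating and running a test case with an async stub in place of a real LLM call, following the pattern used in the README Quick Start (the expected output is illustrative):

```python
import asyncio
from promptlab import PromptManager, PromptTesting

manager = PromptManager()
testing = PromptTesting(manager)

prompt = manager.create(content="Summarize: {text}", name="Summarizer")
case = testing.create_test_case(
    prompt_id=prompt.id,
    input_vars={"text": "Lorem ipsum dolor sit amet..."},
    expected_output="A short summary.",
)

async def stub_llm(prompt_text, vars):
    # Replace with a real LLM API call
    return "A short summary."

result = asyncio.run(testing.run_test_case(case.id, stub_llm))
all_results = asyncio.run(testing.run_all_tests(prompt.id, stub_llm))
```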
+ ## Evaluator
+
+ The `Evaluator` class handles prompt evaluation.
+
+ ```python
+ from promptlab import Evaluator
+ ```
+
+ ### Methods
+
+ #### `__init__(prompt_manager)`
+ - **Description**: Initialize the evaluator.
+ - **Parameters**:
+   - `prompt_manager` (PromptManager): A PromptManager instance.
+
+ #### `register_metric(metric)`
+ - **Description**: Register a new evaluation metric.
+ - **Parameters**:
+   - `metric` (EvaluationMetric): The metric to register.
+
+ #### `evaluate_prompt(prompt_id, inputs, llm_callback, expected_outputs=None, metric_names=None)`
+ - **Description**: Evaluate a prompt with the given inputs and metrics.
+ - **Parameters**:
+   - `prompt_id` (str): The ID of the prompt.
+   - `inputs` (list): List of input dictionaries.
+   - `llm_callback` (callable): Function to call LLM.
+   - `expected_outputs` (list, optional): Expected outputs.
+   - `metric_names` (list, optional): Metrics to use.
+ - **Returns**: Evaluation results.
+
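A sketch of a single evaluation run with a stub callback, mirroring the README Quick Start; which metrics appear in the results depends on what has been registered:

```python
import asyncio
from promptlab import PromptManager, Evaluator

manager = PromptManager()
evaluator = Evaluator(manager)

prompt = manager.create(content="Summarize: {text}", name="Summarizer")

async def stub_llm(prompt_text, vars):
    # Replace with a real LLM API call
    return "A short summary."

results = asyncio.run(evaluator.evaluate_prompt(
    prompt_id=prompt.id,
    inputs=[{"text": "Sample text 1"}, {"text": "Sample text 2"}],
    llm_callback=stub_llm,
))
print(results["aggregated_metrics"])
```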
+ ## PromptTemplate
+
+ The `PromptTemplate` class provides advanced templating.
+
+ ```python
+ from promptlab import PromptTemplate
+ ```
+
+ ### Methods
+
+ #### `__init__(template_string)`
+ - **Description**: Initialize a template.
+ - **Parameters**:
+   - `template_string` (str): Template with variables, conditionals, and loops.
+
+ #### `render(**variables)`
+ - **Description**: Render the template with given variables.
+ - **Parameters**:
+   - `variables` (dict): Variables to substitute.
+ - **Returns**: Rendered string.
+
+ ## EvaluationMetric
+
+ `EvaluationMetric` is the base class for evaluation metrics.
+
+ ```python
+ from promptlab import EvaluationMetric
+ ```
+
+ ### Methods
+
+ #### `__init__(name, description=None)`
+ - **Description**: Initialize a metric.
+ - **Parameters**:
+   - `name` (str): Metric name.
+   - `description` (str, optional): Metric description.
+
+ #### `compute(generated_output, expected_output=None, **kwargs)`
+ - **Description**: Compute the metric score.
+ - **Parameters**:
+   - `generated_output` (str): Output from LLM.
+   - `expected_output` (str, optional): Expected output.
+   - `**kwargs`: Additional parameters.
+ - **Returns**: Score (float between 0 and 1).
+
+ ### Built-in Metrics
+
+ - `ExactMatchMetric`: Scores exact matches between generated and expected output.
+ - `ContainsKeywordsMetric`: Scores based on keyword presence.
+ - `LengthMetric`: Scores based on output length.
+
+ ```python
+ from promptlab import ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
+ ```
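
A small sketch of using a built-in metric directly. The constructor arguments of the built-in metrics are not listed in this reference, so the zero-argument form below is an assumption; `compute()` follows the base-class signature documented above:

```python
from promptlab import ExactMatchMetric

# Assumption: ExactMatchMetric can be constructed without arguments.
metric = ExactMatchMetric()

score = metric.compute("Paris", expected_output="Paris")
print(score)  # expected to be 1.0 for an exact match
```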
docs/cli_usage.md ADDED
@@ -0,0 +1,118 @@
+ # CLI Usage
+
+ PromptLab provides a command-line interface (CLI) for managing prompts, versions, tests, and evaluations.
+
+ ## Basic Commands
+
+ ### Prompt Management
+
+ ```bash
+ # Create a prompt
+ promptlab prompt create "Weather Forecast" --content "Provide a weather forecast for {location} on {date}" --tags "weather,forecast"
+
+ # List all prompts
+ promptlab prompt list
+
+ # Get prompt details
+ promptlab prompt get <prompt_id>
+
+ # Update a prompt
+ promptlab prompt update <prompt_id> --content "New content" --tags "new,tags"
+
+ # Delete a prompt
+ promptlab prompt delete <prompt_id>
+ ```
+
+ ### Version Control
+
+ ```bash
+ # Commit a version
+ promptlab version commit <prompt_id> --message "Version description"
+
+ # List versions
+ promptlab version list <prompt_id>
+
+ # Check out (revert to) a specific version
+ promptlab version checkout <prompt_id> <version_number>
+
+ # Compare versions
+ promptlab version diff <prompt_id> <version1> <version2>
+ ```
+
+ ### Testing
+
+ ```bash
+ # Create a test case
+ promptlab test create <prompt_id> --input '{"location": "New York", "date": "tomorrow"}' --expected "Expected output"
+
+ # List test cases
+ promptlab test list <prompt_id>
+
+ # Run a specific test case
+ promptlab test run <test_case_id> --llm openai
+
+ # Run all test cases for a prompt
+ promptlab test run-all <prompt_id> --llm openai
+
+ # Run an A/B test between two prompts
+ promptlab test ab <prompt_id_a> <prompt_id_b> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
+ ```
+
+ ### Evaluation
+
+ ```bash
+ # Evaluate a prompt
+ promptlab eval run <prompt_id> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
+
+ # List available metrics
+ promptlab eval metrics
+
+ # Register a custom metric
+ promptlab eval register-metric <metric_file.py>
+ ```
+
+ ## Environment Configuration
+
+ The CLI supports environment variables for configuration:
+
+ - `PROMPTLAB_STORAGE`: Path to store prompts and related data
+ - `PROMPTLAB_OPENAI_API_KEY`: OpenAI API key for built-in LLM support
+ - `PROMPTLAB_DEFAULT_LLM`: Default LLM to use for testing and evaluation
+
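For example, these variables can be exported in the shell before invoking the CLI (the values below are placeholders):

```bash
export PROMPTLAB_STORAGE="$HOME/promptlab_storage"
export PROMPTLAB_OPENAI_API_KEY="your-openai-key"
export PROMPTLAB_DEFAULT_LLM="openai"

promptlab prompt list
```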
+ You can also create a config file at `~/.promptlab/config.json`:
+
+ ```json
+ {
+   "storage_path": "/path/to/storage",
+   "default_llm": "openai",
+   "api_keys": {
+     "openai": "your-openai-key"
+   }
+ }
+ ```
+
+ ## Advanced Usage
+
+ ### Multiple Storage Locations
+
+ ```bash
+ # Specify a storage location for a command
+ promptlab --storage /path/to/storage prompt list
+
+ # Export a prompt to another storage
+ promptlab prompt export <prompt_id> --output /path/to/output.json
+
+ # Import a prompt from a file
+ promptlab prompt import /path/to/prompt.json
+ ```
+
+ ### Automation and Scripting
+
+ ```bash
+ # Get output in JSON format
+ promptlab --json prompt list
+
+ # Use in shell scripts
+ PROMPT_ID=$(promptlab --json prompt create "Script Prompt" --content "Content" | jq -r '.id')
+ echo "Created prompt with ID: $PROMPT_ID"
+ ```
docs/getting_started.md ADDED
@@ -0,0 +1,110 @@
+ # Getting Started with PromptLab
+
+ This guide will help you get started with PromptLab, a comprehensive library for managing LLM prompts.
+
+ ## Installation
+
+ ```bash
+ pip install promptlab
+ ```
+
+ ## Basic Usage
+
+ ### Initialize Components
+
+ ```python
+ from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator
+
+ # Initialize with default storage location
+ prompt_manager = PromptManager()
+
+ # Or specify a custom storage location
+ # prompt_manager = PromptManager("/path/to/storage")
+
+ # Initialize other components
+ version_control = VersionControl(prompt_manager)
+ testing = PromptTesting(prompt_manager)
+ evaluator = Evaluator(prompt_manager)
+ ```
+
+ ### Create and Manage Prompts
+
+ ```python
+ # Create a prompt
+ prompt = prompt_manager.create(
+     content="Translate the following text from {source_language} to {target_language}: {text}",
+     name="Translation Prompt",
+     description="A prompt for translating text between languages",
+     tags=["translation", "multilingual"]
+ )
+
+ # The prompt.id property contains a unique identifier (e.g., "a1b2c3d4e5")
+ prompt_id = prompt.id
+
+ # Get a prompt by ID
+ retrieved_prompt = prompt_manager.get(prompt_id)
+
+ # Update a prompt
+ prompt_manager.update(
+     prompt_id,
+     content="Please translate the following text from {source_language} to {target_language}:\n\n{text}"
+ )
+
+ # Search prompts by tags
+ translation_prompts = prompt_manager.search_by_tags(["translation"])
+
+ # List all prompts
+ all_prompts = prompt_manager.list_all()
+ ```
+
+ ### Version Control
+
+ ```python
+ # Create a version snapshot
+ version_control.commit(
+     prompt_id=prompt_id,
+     commit_message="Initial version"
+ )
+
+ # Update the prompt and create another version
+ prompt_manager.update(
+     prompt_id,
+     content="Please provide a translation of the following text from {source_language} to {target_language}:\n\n{text}\n\nMaintain the original formatting and tone."
+ )
+
+ version_control.commit(
+     prompt_id=prompt_id,
+     commit_message="Added formatting instructions"
+ )
+
+ # List all versions
+ versions = version_control.list_versions(prompt_id)
+
+ # Compare versions
+ diff = version_control.diff(prompt_id, 1, 2)
+
+ # Revert to a previous version
+ version_control.checkout(prompt_id, 1)
+ ```
+
+ ### Using Prompts with Variables
+
+ ```python
+ # Get a prompt
+ prompt = prompt_manager.get(prompt_id)
+
+ # Render with variables
+ rendered_prompt = prompt.render(
+     source_language="English",
+     target_language="Spanish",
+     text="Hello, how are you today?"
+ )
+
+ # Now use rendered_prompt with your LLM API
+ ```
+
+ ## Next Steps
+
+ - See the [CLI Usage](./cli_usage.md) guide for command-line operations
+ - Explore [Advanced Features](./advanced_features.md) for templating and custom metrics
+ - Check [Integration Examples](./integration_examples.md) for real-world use cases
docs/integration_examples.md ADDED
@@ -0,0 +1,584 @@
+ # Integration Examples
+
+ This document provides concrete examples of integrating PromptLab into various applications and workflows.
+
+ ## Customer Support Chatbot
+
+ ### Setup
+
+ ```python
+ from promptlab import PromptManager, VersionControl
+ import openai
+
+ # Initialize components
+ prompt_manager = PromptManager()
+ version_control = VersionControl(prompt_manager)
+
+ # Create prompt templates for different scenarios
+ greeting_prompt = prompt_manager.create(
+     content="You are a helpful customer service agent for {company_name}. Greet the customer politely.",
+     name="Customer Greeting",
+     tags=["customer-service", "greeting"]
+ )
+
+ inquiry_prompt = prompt_manager.create(
+     content="""
+ You are a helpful customer service agent for {company_name}.
+ Customer inquiry: {customer_message}
+
+ Based on this inquiry:
+ 1. Identify the main issue
+ 2. Provide a helpful response
+ 3. Offer additional assistance
+
+ Keep your tone professional but friendly.
+ """,
+     name="Customer Inquiry Response",
+     tags=["customer-service", "inquiry"]
+ )
+
+ # Version them
+ version_control.commit(greeting_prompt.id, "Initial version")
+ version_control.commit(inquiry_prompt.id, "Initial version")
+
+ # OpenAI callback
+ def generate_response(prompt_text):
+     response = openai.ChatCompletion.create(
+         model="gpt-3.5-turbo",
+         messages=[{"role": "user", "content": prompt_text}]
+     )
+     return response.choices[0].message.content
+
+ # Main handler function
+ def handle_customer_message(customer_name, message, is_new_conversation):
+     if is_new_conversation:
+         # Use greeting prompt for new conversations
+         prompt = prompt_manager.get(greeting_prompt.id)
+         prompt_text = prompt.render(company_name="Acme Inc.")
+         return generate_response(prompt_text)
+     else:
+         # Use inquiry prompt for ongoing conversations
+         prompt = prompt_manager.get(inquiry_prompt.id)
+         prompt_text = prompt.render(
+             company_name="Acme Inc.",
+             customer_message=message
+         )
+         return generate_response(prompt_text)
+ ```
+
+ ## Content Generation System
+
+ ### Setup
+
+ ```python
+ from promptlab import PromptManager, PromptTesting, Evaluator
+ import asyncio
+
+ # Initialize components
+ prompt_manager = PromptManager("content_system_prompts")
+ testing = PromptTesting(prompt_manager)
+ evaluator = Evaluator(prompt_manager)
+
+ # Create content generation prompt
+ blog_prompt = prompt_manager.create(
+     content="""
+ Write a blog post about {topic}.
+
+ Title: {title}
+
+ The post should:
+ - Be approximately {word_count} words
+ - Be written in a {tone} tone
+ - Include {num_sections} main sections
+ - Target audience: {audience}
+ - Include a compelling call-to-action at the end
+
+ Keywords to include: {keywords}
+ """,
+     name="Blog Post Generator",
+     tags=["content", "blog"]
+ )
+
+ # Test cases
+ test_case = testing.create_test_case(
+     prompt_id=blog_prompt.id,
+     input_vars={
+         "topic": "Sustainable Living",
+         "title": "10 Simple Ways to Reduce Your Carbon Footprint",
+         "word_count": "800",
+         "tone": "informative yet casual",
+         "num_sections": "5",
+         "audience": "environmentally-conscious millennials",
+         "keywords": "sustainability, eco-friendly, carbon footprint, climate change, lifestyle changes"
+     }
+ )
+
+ # LLM callback
+ async def content_llm_callback(prompt, vars):
+     # Call your preferred LLM API here
+     # This is a placeholder
+     return f"Generated content about {vars.get('topic', 'unknown topic')}"
+
+ # Content generation function
+ async def generate_content(content_type, parameters):
+     if content_type == "blog":
+         prompt = prompt_manager.get(blog_prompt.id)
+         rendered_prompt = prompt.render(**parameters)
+
+         # Generate content
+         content = await content_llm_callback(rendered_prompt, parameters)
+
+         # Evaluate quality
+         evaluation = await evaluator.evaluate_prompt(
+             prompt_id=blog_prompt.id,
+             inputs=[parameters],
+             llm_callback=content_llm_callback
+         )
+
+         quality_score = evaluation["aggregated_metrics"].get("length", 0)
+
+         return {
+             "content": content,
+             "quality_score": quality_score,
+             "metadata": {
+                 "prompt_id": blog_prompt.id,
+                 "prompt_version": prompt.version,
+                 "parameters": parameters
+             }
+         }
+     else:
+         raise ValueError(f"Unsupported content type: {content_type}")
+ ```
+
+ ## AI-Assisted Research Tool
+
+ ### Setup
+
+ ```python
+ from promptlab import PromptManager, VersionControl
+ import datetime
+ import json
+ import openai
+
+ # Initialize components
+ prompt_manager = PromptManager("research_prompts")
+ version_control = VersionControl(prompt_manager)
+
+ # Create research prompts
+ article_summary_prompt = prompt_manager.create(
+     content="""
+ Summarize the following research article:
+
+ Title: {article_title}
+ Abstract: {article_abstract}
+
+ Provide a summary that:
+ 1. Identifies the main research question
+ 2. Outlines the methodology
+ 3. Summarizes key findings
+ 4. Highlights limitations
+ 5. Explains the significance of the results
+
+ Keep the summary concise, approximately 250 words.
+ """,
+     name="Article Summarizer",
+     tags=["research", "summary"]
+ )
+
+ research_question_prompt = prompt_manager.create(
+     content="""
+ Based on the following information:
+
+ Research Area: {research_area}
+ Existing Knowledge: {existing_knowledge}
+ Observed Gap: {knowledge_gap}
+
+ Generate 5 potential research questions that:
+ 1. Address the identified knowledge gap
+ 2. Are specific and answerable
+ 3. Have theoretical or practical significance
+ 4. Can be investigated with available research methods
+ """,
+     name="Research Question Generator",
+     tags=["research", "question-generation"]
+ )
+
+ # Version control
+ version_control.commit(article_summary_prompt.id, "Initial version")
+ version_control.commit(research_question_prompt.id, "Initial version")
+
+ # OpenAI callback
+ def research_assistant(prompt_text):
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         messages=[{"role": "user", "content": prompt_text}]
+     )
+     return response.choices[0].message.content
+
+ # Research functions
+ def summarize_article(article_title, article_abstract):
+     prompt = prompt_manager.get(article_summary_prompt.id)
+     prompt_text = prompt.render(
+         article_title=article_title,
+         article_abstract=article_abstract
+     )
+     return research_assistant(prompt_text)
+
+ def generate_research_questions(research_area, existing_knowledge, knowledge_gap):
+     prompt = prompt_manager.get(research_question_prompt.id)
+     prompt_text = prompt.render(
+         research_area=research_area,
+         existing_knowledge=existing_knowledge,
+         knowledge_gap=knowledge_gap
+     )
+     return research_assistant(prompt_text)
+
+ # Save results
+ def save_research_data(research_project, data_type, content):
+     # Save the data along with prompt metadata for reproducibility
+     if data_type == "summary":
+         prompt_id = article_summary_prompt.id
+         prompt = prompt_manager.get(prompt_id)
+     elif data_type == "questions":
+         prompt_id = research_question_prompt.id
+         prompt = prompt_manager.get(prompt_id)
+
+     research_data = {
+         "content": content,
+         "metadata": {
+             "prompt_id": prompt_id,
+             "prompt_version": prompt.version,
+             "timestamp": datetime.datetime.now().isoformat()
+         }
+     }
+
+     # Save to file (in real application, might save to database)
+     with open(f"{research_project}_{data_type}.json", "w") as f:
+         json.dump(research_data, f, indent=2)
+ ```
+
+ ## Educational Quiz Generator
+
+ ### Setup
+
+ ```python
+ from promptlab import PromptManager, PromptTemplate
+ import asyncio
+ import json
+
+ # Initialize components
+ prompt_manager = PromptManager("education_prompts")
+
+ # Quiz generation prompt
+ quiz_prompt = prompt_manager.create(
+     content="""
+ Generate a quiz on the topic of {topic} at a {difficulty_level} difficulty level.
+
+ The quiz should:
+ - Have {num_questions} multiple-choice questions
+ - Cover the following subtopics: {subtopics}
+ - Include {include_explanation} explanations for the correct answers
+ - Be appropriate for {grade_level} students
+
+ For each question, provide:
+ 1. The question text
+ 2. Four possible answers (A, B, C, D)
+ 3. The correct answer
+ {if include_explanation == "yes"}
+ 4. An explanation of why the answer is correct
+ {endif}
+
+ Format the output as valid JSON.
+ """,
+     name="Quiz Generator",
+     tags=["education", "quiz"]
+ )
+
+ # Quiz rendering template using advanced templating
+ render_template = PromptTemplate("""
+ <h1>{quiz_title}</h1>
+
+ <form id="quiz-form">
+ {for question in questions}
+ <div class="question">
+ <p><strong>Question {question.number}:</strong> {question.text}</p>
+ <ul style="list-style-type: none;">
+ {for option in question.options}
+ <li>
+ <input type="radio" name="q{question.number}" id="q{question.number}_{option.letter}" value="{option.letter}">
+ <label for="q{question.number}_{option.letter}">{option.letter}. {option.text}</label>
+ </li>
+ {endfor}
+ </ul>
+
+ {if show_answers}
+ <div class="answer">
+ <p><strong>Correct Answer:</strong> {question.correct_answer}</p>
+ {if question.has_explanation}
+ <p><strong>Explanation:</strong> {question.explanation}</p>
+ {endif}
+ </div>
+ {endif}
+ </div>
+ {endfor}
+
+ {if !show_answers}
+ <button type="submit">Submit Quiz</button>
+ {endif}
+ </form>
+ """)
+
+ # LLM callback
+ async def education_llm_callback(prompt, vars):
+     # This would call your LLM API
+     # Simulated response for this example
+     await asyncio.sleep(1)  # Simulate API call
+     if "quiz" in prompt:
+         return """
+ {
+     "questions": [
+         {
+             "text": "What is the capital of France?",
+             "options": [
+                 {"letter": "A", "text": "London"},
+                 {"letter": "B", "text": "Berlin"},
+                 {"letter": "C", "text": "Paris"},
+                 {"letter": "D", "text": "Madrid"}
+             ],
+             "correct_answer": "C",
+             "explanation": "Paris is the capital and most populous city of France."
+         },
+         {
+             "text": "Who wrote 'Romeo and Juliet'?",
+             "options": [
+                 {"letter": "A", "text": "Charles Dickens"},
+                 {"letter": "B", "text": "William Shakespeare"},
+                 {"letter": "C", "text": "Jane Austen"},
+                 {"letter": "D", "text": "Mark Twain"}
+             ],
+             "correct_answer": "B",
+             "explanation": "William Shakespeare wrote 'Romeo and Juliet' around 1594-1596."
+         }
+     ]
+ }
+ """
+     return "Default response"
+
+ # Quiz generation function
+ async def generate_quiz(topic, difficulty, num_questions, grade_level, include_explanations=True):
+     prompt = prompt_manager.get(quiz_prompt.id)
+     rendered_prompt = prompt.render(
+         topic=topic,
+         difficulty_level=difficulty,
+         num_questions=num_questions,
+         subtopics=", ".join(["key concepts", "historical context", "practical applications"]),
+         include_explanation="yes" if include_explanations else "no",
+         grade_level=grade_level
+     )
+
+     # Get quiz content from LLM
+     quiz_json = await education_llm_callback(rendered_prompt, {})
+
+     # Parse JSON
+     quiz_data = json.loads(quiz_json)
+
+     # Prepare data for HTML template
+     template_data = {
+         "quiz_title": f"{topic} Quiz ({difficulty} Level)",
+         "questions": [],
+         "show_answers": False
+     }
+
+     # Format questions
+     for i, q in enumerate(quiz_data["questions"]):
+         question = {
+             "number": i + 1,
+             "text": q["text"],
+             "options": q["options"],
+             "correct_answer": q["correct_answer"],
+             "has_explanation": "explanation" in q,
+             "explanation": q.get("explanation", "")
+         }
+         template_data["questions"].append(question)
+
+     # Render HTML
+     return render_template.render(**template_data)
+ ```
+
+ ## Automated Coding Assistant
+
+ ### Setup
+
+ ```python
+ from promptlab import PromptManager, PromptTesting
+ import asyncio
+ import re
+ import subprocess
+ import tempfile
+
+ # Initialize components
+ prompt_manager = PromptManager("coding_prompts")
+ testing = PromptTesting(prompt_manager)
+
+ # Create code generation prompts
+ function_prompt = prompt_manager.create(
+     content="""
+ Write a {language} function that solves the following problem:
+
+ {problem_description}
+
+ Function signature: {function_signature}
+
+ Requirements:
+ - The function should handle edge cases
+ - Include appropriate comments
+ - Follow {language} best practices
+ - Be optimized for {optimization_goal}
+
+ {if include_tests == "yes"}
+ Also include unit tests for the function.
+ {endif}
+ """,
+     name="Function Generator",
+     tags=["coding", "function"]
+ )
+
+ bug_fix_prompt = prompt_manager.create(
+     content="""
+ Debug the following {language} code which has an issue:
+
+ ```{language}
+ {buggy_code}
+ ```
+
+ Error message or problem description:
+ {error_description}
+
+ Please:
+ 1. Identify the issue
+ 2. Explain the root cause
+ 3. Provide a fixed version of the code
+ 4. Suggest how to prevent similar issues
+ """,
+     name="Bug Fix Assistant",
+     tags=["coding", "debugging"]
+ )
+
+ # LLM callback
+ async def coding_llm_callback(prompt, vars):
+     # This would call your LLM API
+     # Simplified example response
+     await asyncio.sleep(1)
+
+     if "function" in prompt:
+         return '''
+ ```python
+ def find_max_subarray_sum(arr):
+     """
+     Finds the maximum sum of any contiguous subarray.
+     Uses Kadane's algorithm with O(n) time complexity.
+
+     Args:
+         arr: List of integers
+     Returns:
+         Maximum subarray sum
+     """
+     if not arr:
+         return 0
+
+     current_max = global_max = arr[0]
+
+     for num in arr[1:]:
+         current_max = max(num, current_max + num)
+         global_max = max(global_max, current_max)
+
+     return global_max
+
+ # Unit tests
+ def test_find_max_subarray_sum():
+     assert find_max_subarray_sum([]) == 0
+     assert find_max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]) == 6
+     assert find_max_subarray_sum([-1, -2, -3]) == -1
+     print("All tests passed!")
+ ```
+ '''
+     elif "debug" in prompt:
+         return '''
+ The issue is a classic off-by-one error in the loop boundary.
+
+ Root cause:
+ The loop is using `i <= len(arr)` which accesses an index that's out of bounds.
+
+ Fixed code:
+ ```python
+ def process_array(arr):
+     result = []
+     for i in range(len(arr)):  # Changed from i <= len(arr)
+         result.append(arr[i] * 2)
+     return result
+ ```
+
+ Prevention:
+ - Remember that array indices are 0-based and go up to len(arr)-1
+ - Use range() or enumerate() when iterating through arrays by index
+ - Add bounds checking for critical operations
+ '''
+
+     return "Default response"
+
+ # Function to test generated code
+ def test_generated_code(code, language):
+     """Test the generated code by running it in a safe environment."""
+     if language.lower() == "python":
+         with tempfile.NamedTemporaryFile(suffix=".py") as temp:
+             temp.write(code.encode())
+             temp.flush()
+
+             try:
+                 result = subprocess.run(["python", temp.name],
+                                         capture_output=True,
+                                         text=True,
+                                         timeout=5)
+                 if result.returncode == 0:
+                     return {"success": True, "output": result.stdout}
+                 else:
+                     return {"success": False, "error": result.stderr}
+             except subprocess.TimeoutExpired:
+                 return {"success": False, "error": "Code execution timed out"}
+
+     return {"success": False, "error": f"Testing not implemented for {language}"}
+
+ # Main coding assistant function
+ async def generate_function(problem, language="python", optimization_goal="readability", include_tests=True):
+     function_name = problem.lower().replace(" ", "_").replace("-", "_")
+     signature = f"def {function_name}(parameters):"
+
+     prompt = prompt_manager.get(function_prompt.id)
+     rendered_prompt = prompt.render(
+         language=language,
+         problem_description=problem,
+         function_signature=signature,
+         optimization_goal=optimization_goal,
+         include_tests="yes" if include_tests else "no"
+     )
+
+     # Get code from LLM
+     generated_code = await coding_llm_callback(rendered_prompt, {})
+
+     # Extract code from markdown if present
+     if "```" in generated_code:
+         code_blocks = re.findall(r"```(?:\w+)?\n(.+?)```", generated_code, re.DOTALL)
+         if code_blocks:
+             clean_code = code_blocks[0]
+         else:
+             clean_code = generated_code
+     else:
+         clean_code = generated_code
+
+     # Test the code
+     test_result = test_generated_code(clean_code, language)
+
+     return {
+         "code": clean_code,
+         "test_result": test_result,
+         "prompt_id": function_prompt.id
+     }
+ ```
promptlab/__init__.py ADDED
@@ -0,0 +1,39 @@
+ """
+ PromptLab - A comprehensive LLM Prompt Management System
+
+ PromptLab is a Python library that provides tools for managing, versioning,
+ testing, and evaluating prompts for Large Language Models.
+
+ Features:
+ - Prompt management with versioning
+ - A/B testing for prompt optimization
+ - Evaluation framework with customizable metrics
+ - Command-line interface for easy integration
+ """
+
+ from .core.prompt_manager import PromptManager, Prompt
+ from .core.version_control import VersionControl, PromptVersion
+ from .core.testing import PromptTesting, TestCase, TestResult, ABTestResult
+ from .core.evaluation import Evaluator, EvaluationMetric, ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
+ from .utils.metrics import create_default_metrics_set
+ from .utils.templating import PromptTemplate, template_registry
+
+ __version__ = "0.1.0"
+ __all__ = [
+     "PromptManager",
+     "Prompt",
+     "VersionControl",
+     "PromptVersion",
+     "PromptTesting",
+     "TestCase",
+     "TestResult",
+     "ABTestResult",
+     "Evaluator",
+     "EvaluationMetric",
+     "ExactMatchMetric",
+     "ContainsKeywordsMetric",
+     "LengthMetric",
+     "create_default_metrics_set",
+     "PromptTemplate",
+     "template_registry"
+ ]
promptlab/cli/__init__.py ADDED
File without changes
promptlab/cli/commands.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+ import os
4
+ import json
5
+ from typing import List, Optional, Dict, Any
6
+ import asyncio
7
+
8
+ from ..core.prompt_manager import PromptManager
9
+ from ..core.version_control import VersionControl
10
+ from ..core.testing import PromptTesting
11
+ from ..core.evaluation import Evaluator, ContainsKeywordsMetric, LengthMetric
12
+
13
+
14
+ class CLI:
15
+ """Command-line interface for PromptLab."""
16
+ def __init__(self):
17
+ self.prompt_manager = PromptManager()
18
+ self.version_control = VersionControl(self.prompt_manager)
19
+ self.testing = PromptTesting(self.prompt_manager)
20
+ self.evaluator = Evaluator(self.prompt_manager)
21
+
22
+ self.parser = argparse.ArgumentParser(description="PromptLab - LLM Prompt Management System")
23
+ self._setup_commands()
24
+
25
+ def _setup_commands(self) -> None:
26
+ """Set up command-line arguments."""
27
+ subparsers = self.parser.add_subparsers(dest="command", help="Command")
28
+
29
+ # Prompt commands
30
+ prompt_parser = subparsers.add_parser("prompt", help="Prompt management")
31
+ prompt_subparsers = prompt_parser.add_subparsers(dest="subcommand", help="Prompt subcommand")
32
+
33
+ # Create prompt
34
+ create_parser = prompt_subparsers.add_parser("create", help="Create a new prompt")
35
+ create_parser.add_argument("name", help="Prompt name")
36
+ create_parser.add_argument("--content", help="Prompt content")
37
+ create_parser.add_argument("--file", help="File containing prompt content")
38
+ create_parser.add_argument("--description", help="Prompt description")
39
+ create_parser.add_argument("--tags", help="Comma-separated list of tags")
40
+
41
+ # List prompts
43
+ list_parser = prompt_subparsers.add_parser("list", help="List prompts")
44
+ list_parser.add_argument("--tags", help="Filter by comma-separated list of tags")
45
+
46
+ # Get prompt
47
+ get_parser = prompt_subparsers.add_parser("get", help="Get a prompt")
48
+ get_parser.add_argument("id", help="Prompt ID")
49
+
50
+ # Update prompt
51
+ update_parser = prompt_subparsers.add_parser("update", help="Update a prompt")
52
+ update_parser.add_argument("id", help="Prompt ID")
53
+ update_parser.add_argument("--content", help="New prompt content")
54
+ update_parser.add_argument("--file", help="File containing new prompt content")
55
+ update_parser.add_argument("--name", help="New prompt name")
56
+ update_parser.add_argument("--description", help="New prompt description")
57
+ update_parser.add_argument("--tags", help="New comma-separated list of tags")
58
+
59
+ # Delete prompt
60
+ delete_parser = prompt_subparsers.add_parser("delete", help="Delete a prompt")
61
+ delete_parser.add_argument("id", help="Prompt ID")
62
+
63
+ # Version control commands
64
+ version_parser = subparsers.add_parser("version", help="Version control")
65
+ version_subparsers = version_parser.add_subparsers(dest="subcommand", help="Version subcommand")
66
+
67
+ # Commit
68
+ commit_parser = version_subparsers.add_parser("commit", help="Create a new version")
69
+ commit_parser.add_argument("id", help="Prompt ID")
70
+ commit_parser.add_argument("--message", help="Commit message")
71
+
72
+ # List versions
73
+ list_versions_parser = version_subparsers.add_parser("list", help="List versions")
74
+ list_versions_parser.add_argument("id", help="Prompt ID")
75
+
76
+ # Checkout
77
+ checkout_parser = version_subparsers.add_parser("checkout", help="Checkout a version")
78
+ checkout_parser.add_argument("id", help="Prompt ID")
79
+ checkout_parser.add_argument("version", type=int, help="Version number")
80
+
81
+ # Diff
82
+ diff_parser = version_subparsers.add_parser("diff", help="Compare versions")
83
+ diff_parser.add_argument("id", help="Prompt ID")
84
+ diff_parser.add_argument("version1", type=int, help="First version")
85
+ diff_parser.add_argument("version2", type=int, help="Second version")
86
+
87
+ # Testing commands
88
+ test_parser = subparsers.add_parser("test", help="Testing")
89
+ test_subparsers = test_parser.add_subparsers(dest="subcommand", help="Test subcommand")
90
+
91
+ # Create test case
92
+ create_test_parser = test_subparsers.add_parser("create", help="Create a test case")
93
+ create_test_parser.add_argument("prompt_id", help="Prompt ID")
94
+ create_test_parser.add_argument("--input", help="JSON string of input variables")
95
+ create_test_parser.add_argument("--input-file", help="File containing JSON input variables")
96
+ create_test_parser.add_argument("--expected", help="Expected output")
97
+ create_test_parser.add_argument("--expected-file", help="File containing expected output")
98
+ create_test_parser.add_argument("--name", help="Test case name")
99
+ create_test_parser.add_argument("--description", help="Test case description")
100
+
101
+ # List test cases
102
+ list_tests_parser = test_subparsers.add_parser("list", help="List test cases")
103
+ list_tests_parser.add_argument("--prompt-id", help="Filter by prompt ID")
104
+
105
+ # Run test case
106
+ run_test_parser = test_subparsers.add_parser("run", help="Run a test case")
107
+ run_test_parser.add_argument("test_id", help="Test case ID")
108
+ run_test_parser.add_argument("--llm", help="LLM callback function to use")
109
+
110
+ # Run all test cases for a prompt
111
+ run_all_parser = test_subparsers.add_parser("run-all", help="Run all test cases for a prompt")
112
+ run_all_parser.add_argument("prompt_id", help="Prompt ID")
113
+ run_all_parser.add_argument("--llm", help="LLM callback function to use")
114
+
115
+ # A/B test
116
+ ab_test_parser = test_subparsers.add_parser("ab", help="Run an A/B test")
117
+ ab_test_parser.add_argument("prompt_a", help="Prompt A ID")
118
+ ab_test_parser.add_argument("prompt_b", help="Prompt B ID")
119
+ ab_test_parser.add_argument("--llm", help="LLM callback function to use")
120
+ ab_test_parser.add_argument("--test-cases", help="Comma-separated list of test case IDs")
121
+
122
+ # Evaluation commands
123
+ eval_parser = subparsers.add_parser("eval", help="Evaluation")
124
+ eval_subparsers = eval_parser.add_subparsers(dest="subcommand", help="Evaluation subcommand")
125
+
126
+ # List metrics
127
+ list_metrics_parser = eval_subparsers.add_parser("metrics", help="List evaluation metrics")
128
+
129
+ # Register metric
130
+ register_metric_parser = eval_subparsers.add_parser("register", help="Register a custom metric")
131
+ register_metric_parser.add_argument("name", help="Metric name")
132
+ register_metric_parser.add_argument("--keywords", help="Keywords for ContainsKeywordsMetric")
133
+ register_metric_parser.add_argument("--min-length", type=int, help="Minimum length for LengthMetric")
134
+ register_metric_parser.add_argument("--max-length", type=int, help="Maximum length for LengthMetric")
135
+ register_metric_parser.add_argument("--target-length", type=int, help="Target length for LengthMetric")
136
+
137
+ # Evaluate prompt
138
+ evaluate_parser = eval_subparsers.add_parser("run", help="Evaluate a prompt")
139
+ evaluate_parser.add_argument("prompt_id", help="Prompt ID")
140
+ evaluate_parser.add_argument("--inputs", help="JSON string of input variables list")
141
+ evaluate_parser.add_argument("--inputs-file", help="File containing JSON input variables list")
142
+ evaluate_parser.add_argument("--expected", help="JSON string of expected outputs list")
143
+ evaluate_parser.add_argument("--expected-file", help="File containing JSON expected outputs list")
144
+ evaluate_parser.add_argument("--metrics", help="Comma-separated list of metrics to use")
145
+ evaluate_parser.add_argument("--llm", help="LLM callback function to use")
146
+
147
+ def run(self, args: Optional[List[str]] = None) -> None:
148
+ """Run the CLI with the given arguments."""
149
+ args = self.parser.parse_args(args)
150
+
151
+ if not args.command:
152
+ self.parser.print_help()
153
+ return
154
+
155
+ # Handle commands
156
+ if args.command == "prompt":
157
+ self._handle_prompt_command(args)
158
+ elif args.command == "version":
159
+ self._handle_version_command(args)
160
+ elif args.command == "test":
161
+ self._handle_test_command(args)
162
+ elif args.command == "eval":
163
+ self._handle_eval_command(args)
164
+
165
+ def _handle_prompt_command(self, args) -> None:
166
+ """Handle prompt commands."""
167
+ if not args.subcommand:
168
+ return
169
+
170
+ if args.subcommand == "create":
171
+ # Get content from file or argument
172
+ content = ""
173
+ if args.file:
174
+ with open(args.file, "r") as f:
175
+ content = f.read()
176
+ elif args.content:
177
+ content = args.content
178
+ else:
179
+ print("Error: Must provide either --content or --file")
180
+ return
181
+
182
+ # Parse tags
183
+ tags = []
184
+ if args.tags:
185
+ tags = [tag.strip() for tag in args.tags.split(",")]
186
+
187
+ # Create prompt
188
+ prompt = self.prompt_manager.create(
189
+ content=content,
190
+ name=args.name,
191
+ description=args.description,
192
+ tags=tags
193
+ )
194
+
195
+ print(f"Created prompt with ID: {prompt.id}")
196
+
197
+ elif args.subcommand == "list":
198
+ # Parse tags
199
+ tags = None
200
+ if args.tags:
201
+ tags = [tag.strip() for tag in args.tags.split(",")]
202
+
203
+ # List prompts
204
+ prompts = self.prompt_manager.list(tags)
205
+
206
+ if not prompts:
207
+ print("No prompts found")
208
+ return
209
+
210
+ # Print prompts
211
+ print(f"Found {len(prompts)} prompts:")
212
+ for prompt in prompts:
213
+ tags_str = ", ".join(prompt.tags) if prompt.tags else ""
214
+ print(f"ID: {prompt.id} | Name: {prompt.name} | Tags: {tags_str}")
215
+
216
+ elif args.subcommand == "get":
217
+ # Get prompt
218
+ prompt = self.prompt_manager.get(args.id)
219
+
220
+ if not prompt:
221
+ print(f"Prompt with ID {args.id} not found")
222
+ return
223
+
224
+ # Print prompt
225
+ print(f"ID: {prompt.id}")
226
+ print(f"Name: {prompt.name}")
227
+ print(f"Description: {prompt.description}")
228
+ print(f"Tags: {', '.join(prompt.tags)}")
229
+ print(f"Version: {prompt.version}")
230
+ print(f"Created: {prompt.created_at}")
231
+ print(f"Updated: {prompt.updated_at}")
232
+ print("\nContent:")
233
+ print(prompt.content)
234
+
235
+ elif args.subcommand == "update":
236
+ # Get prompt
237
+ prompt = self.prompt_manager.get(args.id)
238
+
239
+ if not prompt:
240
+ print(f"Prompt with ID {args.id} not found")
241
+ return
242
+
243
+ # Update kwargs
244
+ kwargs = {}
245
+
246
+ if args.name:
247
+ kwargs["name"] = args.name
248
+
249
+ if args.description:
250
+ kwargs["description"] = args.description
251
+
252
+ if args.tags:
253
+ kwargs["tags"] = [tag.strip() for tag in args.tags.split(",")]
254
+
255
+ # Get content from file or argument
256
+ if args.file:
257
+ with open(args.file, "r") as f:
258
+ kwargs["content"] = f.read()
259
+ elif args.content:
260
+ kwargs["content"] = args.content
261
+
262
+ # Update prompt
263
+ prompt = self.prompt_manager.update(args.id, **kwargs)
264
+
265
+ print(f"Updated prompt with ID: {prompt.id}")
266
+
267
+ elif args.subcommand == "delete":
268
+ # Delete prompt
269
+ success = self.prompt_manager.delete(args.id)
270
+
271
+ if success:
272
+ print(f"Deleted prompt with ID: {args.id}")
273
+ else:
274
+ print(f"Prompt with ID {args.id} not found")
275
+
276
+ def _handle_version_command(self, args) -> None:
277
+ """Handle version control commands."""
278
+ if not args.subcommand:
279
+ return
280
+
281
+ if args.subcommand == "commit":
282
+ # Commit version
283
+ version = self.version_control.commit(
284
+ prompt_id=args.id,
285
+ commit_message=args.message
286
+ )
287
+
288
+ if not version:
289
+ print(f"Prompt with ID {args.id} not found")
290
+ return
291
+
292
+ print(f"Committed version {version.version} for prompt {args.id}")
293
+
294
+ elif args.subcommand == "list":
295
+ # List versions
296
+ versions = self.version_control.list_versions(args.id)
297
+
298
+ if not versions:
299
+ print(f"No versions found for prompt {args.id}")
300
+ return
301
+
302
+ # Print versions
303
+ print(f"Found {len(versions)} versions for prompt {args.id}:")
304
+ for version in versions:
305
+ message = version.commit_message or "No commit message"
306
+ print(f"Version: {version.version} | Created: {version.created_at} | Message: {message}")
307
+
308
+ elif args.subcommand == "checkout":
309
+ # Checkout version
310
+ prompt = self.version_control.checkout(
311
+ prompt_id=args.id,
312
+ version=args.version
313
+ )
314
+
315
+ if not prompt:
316
+ print(f"Prompt with ID {args.id} or version {args.version} not found")
317
+ return
318
+
319
+ print(f"Checked out version {args.version} for prompt {args.id}")
320
+
321
+ elif args.subcommand == "diff":
322
+ # Diff versions
323
+ diff = self.version_control.diff(
324
+ prompt_id=args.id,
325
+ version1=args.version1,
326
+ version2=args.version2
327
+ )
328
+
329
+ if not diff:
330
+ print(f"Could not compare versions {args.version1} and {args.version2} for prompt {args.id}")
331
+ return
332
+
333
+ # Print diff
334
+ print(f"Diff between version {args.version1} and {args.version2} for prompt {args.id}:")
335
+ for line in diff["diff"]:
336
+ print(line)
337
+
338
+ def _handle_test_command(self, args) -> None:
339
+ """Handle testing commands."""
340
+ if not args.subcommand:
341
+ return
342
+
343
+ if args.subcommand == "create":
344
+ # Parse input variables
345
+ input_vars = {}
346
+ if args.input:
347
+ input_vars = json.loads(args.input)
348
+ elif args.input_file:
349
+ with open(args.input_file, "r") as f:
350
+ input_vars = json.loads(f.read())
351
+ else:
352
+ print("Error: Must provide either --input or --input-file")
353
+ return
354
+
355
+ # Parse expected output
356
+ expected = None
357
+ if args.expected:
358
+ expected = args.expected
359
+ elif args.expected_file:
360
+ with open(args.expected_file, "r") as f:
361
+ expected = f.read()
362
+
363
+ # Create test case
364
+ test_case = self.testing.create_test_case(
365
+ prompt_id=args.prompt_id,
366
+ input_vars=input_vars,
367
+ expected_output=expected,
368
+ name=args.name,
369
+ description=args.description
370
+ )
371
+
372
+ print(f"Created test case with ID: {test_case.id}")
373
+
374
+ elif args.subcommand == "list":
375
+ # List test cases
376
+ test_cases = self.testing.list_test_cases(args.prompt_id)
377
+
378
+ if not test_cases:
379
+ print("No test cases found")
380
+ return
381
+
382
+ # Print test cases
383
+ print(f"Found {len(test_cases)} test cases:")
384
+ for tc in test_cases:
385
+ print(f"ID: {tc.id} | Name: {tc.name} | Prompt ID: {tc.prompt_id}")
386
+
387
+ elif args.subcommand == "run":
388
+ # Get LLM callback
389
+ llm_callback = self._get_llm_callback(args.llm)
390
+
391
+ # Run test case
392
+ asyncio.run(self._run_test_case(args.test_id, llm_callback))
393
+
394
+ elif args.subcommand == "run-all":
395
+ # Get LLM callback
396
+ llm_callback = self._get_llm_callback(args.llm)
397
+
398
+ # Run all test cases
399
+ asyncio.run(self._run_all_test_cases(args.prompt_id, llm_callback))
400
+
401
+ elif args.subcommand == "ab":
402
+ # Get LLM callback
403
+ llm_callback = self._get_llm_callback(args.llm)
404
+
405
+ # Parse test case IDs
406
+ test_cases = None
407
+ if args.test_cases:
408
+ test_cases = [tc.strip() for tc in args.test_cases.split(",")]
409
+
410
+ # Run A/B test
411
+ asyncio.run(self._run_ab_test(args.prompt_a, args.prompt_b, llm_callback, test_cases))
412
+
413
+ async def _run_test_case(self, test_case_id, llm_callback) -> None:
414
+ """Run a test case."""
415
+ try:
416
+ metrics_callbacks = [
417
+ self._create_metrics_callback("exact_match"),
418
+ self._create_metrics_callback("similarity"),
419
+ self._create_metrics_callback("length")
420
+ ]
421
+
422
+ result = await self.testing.run_test_case(
423
+ test_case_id=test_case_id,
424
+ llm_callback=llm_callback,
425
+ metrics_callbacks=metrics_callbacks
426
+ )
427
+
428
+ print(f"Test result ID: {result.id}")
429
+ print(f"Test case ID: {result.test_case_id}")
430
+ print(f"Prompt ID: {result.prompt_id}")
431
+ print(f"Prompt version: {result.prompt_version}")
432
+ print(f"Passed: {result.passed}")
433
+
434
+ if result.metrics:
435
+ print("\nMetrics:")
436
+ for name, value in result.metrics.items():
437
+ print(f"{name}: {value}")
438
+
439
+ print("\nOutput:")
440
+ print(result.output)
441
+ except Exception as e:
442
+ print(f"Error running test case: {e}")
443
+
444
+ async def _run_all_test_cases(self, prompt_id, llm_callback) -> None:
445
+ """Run all test cases for a prompt."""
446
+ try:
447
+ metrics_callbacks = [
448
+ self._create_metrics_callback("exact_match"),
449
+ self._create_metrics_callback("similarity"),
450
+ self._create_metrics_callback("length")
451
+ ]
452
+
453
+ results = await self.testing.run_test_cases(
454
+ prompt_id=prompt_id,
455
+ llm_callback=llm_callback,
456
+ metrics_callbacks=metrics_callbacks
457
+ )
458
+
459
+ print(f"Ran {len(results)} test cases for prompt {prompt_id}")
460
+
461
+ # Calculate aggregate metrics
462
+ if results:
463
+ passed = sum(1 for r in results if r.passed)
464
+ print(f"Passed: {passed}/{len(results)} ({passed/len(results)*100:.2f}%)")
465
+
466
+ # Aggregate metrics
467
+ metrics = {}
468
+ for r in results:
469
+ for name, value in r.metrics.items():
470
+ if name not in metrics:
471
+ metrics[name] = []
472
+ metrics[name].append(value)
473
+
474
+ print("\nAggregate metrics:")
475
+ for name, values in metrics.items():
476
+ avg = sum(values) / len(values)
477
+ print(f"{name}: {avg:.4f}")
478
+ except Exception as e:
479
+ print(f"Error running test cases: {e}")
480
+
481
+ async def _run_ab_test(self, prompt_a_id, prompt_b_id, llm_callback, test_cases) -> None:
482
+ """Run an A/B test."""
483
+ try:
484
+ metrics_callbacks = [
485
+ self._create_metrics_callback("exact_match"),
486
+ self._create_metrics_callback("similarity"),
487
+ self._create_metrics_callback("length")
488
+ ]
489
+
490
+ result = await self.testing.run_ab_test(
491
+ prompt_a_id=prompt_a_id,
492
+ prompt_b_id=prompt_b_id,
493
+ llm_callback=llm_callback,
494
+ metrics_callbacks=metrics_callbacks,
495
+ test_cases=test_cases
496
+ )
497
+
498
+ print(f"A/B test result ID: {result.id}")
499
+ print(f"Prompt A ID: {result.prompt_a_id}")
500
+ print(f"Prompt B ID: {result.prompt_b_id}")
501
+ print(f"Winner: {result.winner or 'Tie'}")
502
+
503
+ print("\nPrompt A metrics:")
504
+ for name, value in result.metrics_a.items():
505
+ print(f"{name}: {value:.4f}")
506
+
507
+ print("\nPrompt B metrics:")
508
+ for name, value in result.metrics_b.items():
509
+ print(f"{name}: {value:.4f}")
510
+ except Exception as e:
511
+ print(f"Error running A/B test: {e}")
512
+
513
+ def _handle_eval_command(self, args) -> None:
514
+ """Handle evaluation commands."""
515
+ if not args.subcommand:
516
+ return
517
+
518
+ if args.subcommand == "metrics":
519
+ # List metrics
520
+ metrics = self.evaluator.list_metrics()
521
+
522
+ if not metrics:
523
+ print("No metrics registered")
524
+ return
525
+
526
+ # Print metrics
527
+ print(f"Found {len(metrics)} metrics:")
528
+ for metric in metrics:
529
+ print(f"Name: {metric.name} | Description: {metric.description}")
530
+
531
+ elif args.subcommand == "register":
532
+ # Register custom metric
533
+ if args.keywords:
534
+ # Register ContainsKeywordsMetric
535
+ keywords = [k.strip() for k in args.keywords.split(",")]
536
+ metric = ContainsKeywordsMetric(keywords)
537
+ self.evaluator.register_metric(metric)
538
+ print(f"Registered ContainsKeywordsMetric with name: {metric.name}")
539
+ elif args.min_length is not None or args.max_length is not None or args.target_length is not None:
540
+ # Register LengthMetric
541
+ metric = LengthMetric(
542
+ min_length=args.min_length,
543
+ max_length=args.max_length,
544
+ target_length=args.target_length
545
+ )
546
+ self.evaluator.register_metric(metric)
547
+ print(f"Registered LengthMetric with name: {metric.name}")
548
+ else:
549
+ print("Error: Must provide either --keywords, --min-length, --max-length, or --target-length")
550
+
551
+ elif args.subcommand == "run":
552
+ # Parse inputs
553
+ inputs = []
554
+ if args.inputs:
555
+ inputs = json.loads(args.inputs)
556
+ elif args.inputs_file:
557
+ with open(args.inputs_file, "r") as f:
558
+ inputs = json.loads(f.read())
559
+ else:
560
+ print("Error: Must provide either --inputs or --inputs-file")
561
+ return
562
+
563
+ # Parse expected outputs
564
+ expected_outputs = None
565
+ if args.expected:
566
+ expected_outputs = json.loads(args.expected)
567
+ elif args.expected_file:
568
+ with open(args.expected_file, "r") as f:
569
+ expected_outputs = json.loads(f.read())
570
+
571
+ # Parse metrics
572
+ metric_names = None
573
+ if args.metrics:
574
+ metric_names = [m.strip() for m in args.metrics.split(",")]
575
+
576
+ # Get LLM callback
577
+ llm_callback = self._get_llm_callback(args.llm)
578
+
579
+ # Run evaluation
580
+ asyncio.run(self._run_evaluation(
581
+ args.prompt_id,
582
+ inputs,
583
+ expected_outputs,
584
+ metric_names,
585
+ llm_callback
586
+ ))
587
+
588
+ async def _run_evaluation(self, prompt_id, inputs, expected_outputs, metric_names, llm_callback) -> None:
589
+ """Run an evaluation."""
590
+ try:
591
+ result = await self.evaluator.evaluate_prompt(
592
+ prompt_id=prompt_id,
593
+ inputs=inputs,
594
+ llm_callback=llm_callback,
595
+ expected_outputs=expected_outputs,
596
+ metric_names=metric_names
597
+ )
598
+
599
+ print(f"Evaluated prompt {prompt_id} with {result['num_samples']} samples")
600
+
601
+ # Print aggregated metrics
602
+ print("\nAggregated metrics:")
603
+ for name, value in result["aggregated_metrics"].items():
604
+ print(f"{name}: {value:.4f}")
605
+
606
+ # Print individual results
607
+ print("\nIndividual results:")
608
+ for i, r in enumerate(result["individual_results"]):
609
+ print(f"\nSample {i+1}:")
610
+ print(f"Input: {json.dumps(r['input'])}")
611
+ print(f"Output: {r['output']}")
612
+ if r["expected"]:
613
+ print(f"Expected: {r['expected']}")
614
+
615
+ print("Metrics:")
616
+ for name, value in r["metrics"].items():
617
+ print(f"{name}: {value:.4f}")
618
+ except Exception as e:
619
+ print(f"Error running evaluation: {e}")
620
+
621
+ def _get_llm_callback(self, llm_name: Optional[str]) -> callable:
622
+ """Get an LLM callback function."""
623
+ # Default to a simple echo function for testing
624
+ if not llm_name or llm_name == "echo":
625
+ async def echo_callback(prompt, vars):
626
+ return f"Echo: {prompt}"
627
+ return echo_callback
628
+
629
+ # Add more LLM callbacks as needed
630
+ if llm_name == "openai":
631
+ # Example implementation using OpenAI
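+ # Note: this sketch relies on the legacy (pre-1.0) OpenAI Python SDK; the
+ # Completion.acreate call and the text-davinci-003 model have since been
+ # retired, so newer SDK versions would need a different client interface.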
632
+ try:
633
+ import openai
634
+
635
+ async def openai_callback(prompt, vars):
636
+ response = await openai.Completion.acreate(
637
+ model="text-davinci-003",
638
+ prompt=prompt,
639
+ max_tokens=1000
640
+ )
641
+ return response.choices[0].text.strip()
642
+
643
+ return openai_callback
644
+ except ImportError:
645
+ print("Error: OpenAI package not installed. Run `pip install openai` to use this LLM.")
646
+ sys.exit(1)
647
+
648
+ # Add more LLM implementations as needed
649
+
650
+ print(f"Error: Unknown LLM callback: {llm_name}")
651
+ sys.exit(1)
652
+
653
+ def _create_metrics_callback(self, metric_type: str) -> callable:
654
+ """Create a metrics callback function."""
655
+ # Simple metrics
656
+ if metric_type == "exact_match":
657
+ def exact_match_callback(output, expected):
658
+ if not expected:
659
+ return {"exact_match": 0.0}
660
+ return {"exact_match": 1.0 if output.strip() == expected.strip() else 0.0}
661
+ return exact_match_callback
662
+
663
+ elif metric_type == "similarity":
664
+ from difflib import SequenceMatcher
665
+
666
+ def similarity_callback(output, expected):
667
+ if not expected:
668
+ return {"similarity": 0.0}
669
+ return {"similarity": SequenceMatcher(None, output, expected).ratio()}
670
+ return similarity_callback
671
+
672
+ elif metric_type == "length":
673
+ def length_callback(output, expected):
674
+ out_len = len(output)
675
+ if not expected:
676
+ return {"length": 1.0 if out_len > 0 else 0.0}
677
+
678
+ exp_len = len(expected)
679
+ if exp_len == 0:
680
+ return {"length": 1.0 if out_len == 0 else 0.0}
681
+
682
+ # Return score inversely proportional to the difference
683
+ # Guard against an empty output before dividing
+ if out_len == 0:
+ return {"length": 0.0}
+ ratio = min(out_len / exp_len, exp_len / out_len)
684
+ return {"length": ratio}
685
+ return length_callback
686
+
687
+ # Default no-op metric
688
+ return lambda output, expected: {}
689
+
690
+
691
+ def main():
692
+ """Main entry point for the CLI."""
693
+ CLI().run()
694
+
695
+
696
+ if __name__ == "__main__":
697
+ main()
promptlab/core/__init__.py ADDED
File without changes
promptlab/core/evaluation.py ADDED
@@ -0,0 +1,191 @@
1
+ import os
2
+ import json
3
+ import datetime
4
+ from typing import Dict, List, Optional, Any, Callable, Union, Awaitable
5
+ import asyncio
6
+ from .prompt_manager import PromptManager, Prompt
7
+
8
+ class EvaluationMetric:
9
+ """Base class for evaluation metrics."""
10
+ def __init__(self, name: str, description: Optional[str] = None):
11
+ self.name = name
12
+ self.description = description or ""
13
+
14
+ def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
15
+ """Compute the metric. Must be implemented by subclasses."""
16
+ raise NotImplementedError("Subclasses must implement compute method")
17
+
18
+ class ExactMatchMetric(EvaluationMetric):
19
+ """Evaluates exact match between generated and expected output."""
20
+ def __init__(self):
21
+ super().__init__("exact_match", "Exact match between generated and expected output")
22
+
23
+ def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
24
+ """Return 1.0 if generated matches expected exactly, 0.0 otherwise."""
25
+ if not expected_output:
26
+ return 0.0
27
+ return 1.0 if generated_output.strip() == expected_output.strip() else 0.0
28
+
29
+ class ContainsKeywordsMetric(EvaluationMetric):
30
+ """Evaluates if the generated output contains specified keywords."""
31
+ def __init__(self, keywords: List[str], case_sensitive: bool = False):
32
+ super().__init__(
33
+ "contains_keywords",
34
+ f"Check if output contains keywords: {', '.join(keywords)}"
35
+ )
36
+ self.keywords = keywords
37
+ self.case_sensitive = case_sensitive
38
+
39
+ def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
40
+ """Return percentage of keywords found in the output."""
41
+ if not self.keywords:
42
+ return 0.0
43
+
44
+ if not self.case_sensitive:
45
+ generated_output = generated_output.lower()
46
+ keywords = [k.lower() for k in self.keywords]
47
+ else:
48
+ keywords = self.keywords
49
+
50
+ matches = sum(1 for k in keywords if k in generated_output)
51
+ return matches / len(keywords)
52
+
53
+ class LengthMetric(EvaluationMetric):
54
+ """Evaluates if the generated output length is within the desired range."""
55
+ def __init__(self, min_length: Optional[int] = None, max_length: Optional[int] = None, target_length: Optional[int] = None):
56
+ description = "Evaluate output length"
57
+ if target_length is not None:
58
+ description = f"Evaluate if output length is close to {target_length} characters"
59
+ elif min_length is not None and max_length is not None:
60
+ description = f"Evaluate if output length is between {min_length} and {max_length} characters"
61
+ elif min_length is not None:
62
+ description = f"Evaluate if output length is at least {min_length} characters"
63
+ elif max_length is not None:
64
+ description = f"Evaluate if output length is at most {max_length} characters"
65
+
66
+ super().__init__("length", description)
67
+ self.min_length = min_length
68
+ self.max_length = max_length
69
+ self.target_length = target_length
70
+
71
+ def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
72
+ """Return score based on length conditions."""
73
+ length = len(generated_output)
74
+
75
+ if self.target_length is not None:
76
+ # Score inversely proportional to the distance from target
77
+ max_distance = self.target_length # Normalize to a max distance
78
+ distance = abs(length - self.target_length)
79
+ return max(0, 1 - (distance / max_distance))
80
+
81
+ # Check if within bounds
82
+ within_min = self.min_length is None or length >= self.min_length
83
+ within_max = self.max_length is None or length <= self.max_length
84
+
85
+ if within_min and within_max:
86
+ return 1.0
87
+ elif within_min and self.max_length:
88
+ # Over max length, calculate proportional penalty
89
+ return max(0, 1 - ((length - self.max_length) / self.max_length))
90
+ elif within_max and self.min_length:
91
+ # Under min length, calculate proportional penalty
92
+ return max(0, length / self.min_length)
93
+ return 0.0
94
+
95
+ class Evaluator:
96
+ """Manages evaluation metrics and evaluation runs."""
97
+ def __init__(self, prompt_manager: PromptManager):
98
+ self.prompt_manager = prompt_manager
99
+ self.metrics: Dict[str, EvaluationMetric] = {}
100
+ self.storage_path = os.path.join(prompt_manager.storage_path, "evaluations")
101
+ os.makedirs(self.storage_path, exist_ok=True)
102
+
103
+ # Register built-in metrics
104
+ self.register_metric(ExactMatchMetric())
105
+ self.register_metric(ContainsKeywordsMetric(["important", "critical", "necessary"]))
106
+ self.register_metric(LengthMetric(min_length=50, max_length=500))
107
+
108
+ def register_metric(self, metric: EvaluationMetric) -> None:
109
+ """Register a new evaluation metric."""
110
+ self.metrics[metric.name] = metric
111
+
112
+ def get_metric(self, name: str) -> Optional[EvaluationMetric]:
113
+ """Get a registered metric by name."""
114
+ return self.metrics.get(name)
115
+
116
+ def list_metrics(self) -> List[EvaluationMetric]:
117
+ """List all registered metrics."""
118
+ return list(self.metrics.values())
119
+
120
+ async def evaluate_prompt(
121
+ self,
122
+ prompt_id: str,
123
+ inputs: List[Dict[str, Any]],
124
+ llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
125
+ expected_outputs: Optional[List[Optional[str]]] = None,
126
+ metric_names: Optional[List[str]] = None
127
+ ) -> Dict[str, Any]:
128
+ """Evaluate a prompt with the given inputs and metrics."""
129
+ prompt = self.prompt_manager.get(prompt_id)
130
+ if not prompt:
131
+ raise ValueError(f"Prompt with ID {prompt_id} not found")
132
+
133
+ # Use all registered metrics if none specified
134
+ if not metric_names:
135
+ metrics_to_use = list(self.metrics.values())
136
+ else:
137
+ metrics_to_use = [self.get_metric(name) for name in metric_names if self.get_metric(name)]
138
+
139
+ if not metrics_to_use:
140
+ raise ValueError("No valid metrics specified")
141
+
142
+ # Ensure expected_outputs is the same length as inputs
143
+ if expected_outputs is None:
144
+ expected_outputs = [None] * len(inputs)
145
+ elif len(expected_outputs) != len(inputs):
146
+ raise ValueError("Expected outputs must be the same length as inputs")
147
+
148
+ results = []
149
+ for i, (input_vars, expected) in enumerate(zip(inputs, expected_outputs)):
150
+ # Render the prompt
151
+ rendered_prompt = prompt.render(**input_vars)
152
+
153
+ # Generate output
154
+ if asyncio.iscoroutinefunction(llm_callback):
155
+ output = await llm_callback(rendered_prompt, input_vars)
156
+ else:
157
+ output = llm_callback(rendered_prompt, input_vars)
158
+
159
+ # Compute metrics
160
+ metrics_results = {}
161
+ for metric in metrics_to_use:
162
+ metrics_results[metric.name] = metric.compute(output, expected, **input_vars)
163
+
164
+ results.append({
165
+ "input": input_vars,
166
+ "output": output,
167
+ "expected": expected,
168
+ "metrics": metrics_results
169
+ })
170
+
171
+ # Aggregate metrics
172
+ aggregated_metrics = {}
173
+ for metric in metrics_to_use:
174
+ values = [r["metrics"][metric.name] for r in results]
175
+ aggregated_metrics[metric.name] = sum(values) / len(values) if values else 0
176
+
177
+ evaluation_result = {
178
+ "prompt_id": prompt_id,
179
+ "prompt_version": prompt.version,
180
+ "num_samples": len(inputs),
181
+ "aggregated_metrics": aggregated_metrics,
182
+ "individual_results": results
183
+ }
184
+
185
+ # Save evaluation result
186
+ timestamp = datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-")
187
+ file_path = os.path.join(self.storage_path, f"eval_{prompt_id}_{timestamp}.json")
188
+ with open(file_path, "w") as f:
189
+ json.dump(evaluation_result, f, indent=2)
190
+
191
+ return evaluation_result
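+
+ # Illustrative usage sketch; the prompt ID, input variables, and callback
+ # below are placeholders, not values defined by this module:
+ #
+ # async def llm(prompt, variables):
+ # return "model output" # call a real model here
+ #
+ # result = asyncio.run(evaluator.evaluate_prompt(
+ # prompt_id="abc123",
+ # inputs=[{"topic": "prompt testing"}],
+ # llm_callback=llm,
+ # metric_names=["length"]
+ # ))
+ # print(result["aggregated_metrics"])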
promptlab/core/prompt_manager.py ADDED
@@ -0,0 +1,169 @@
1
+ import os
2
+ import json
3
+ import hashlib
4
+ import datetime
5
+ from typing import Dict, List, Optional, Union, Any
6
+
7
+ class Prompt:
8
+ def __init__(
9
+ self,
10
+ content: str,
11
+ name: str,
12
+ description: Optional[str] = None,
13
+ tags: Optional[List[str]] = None,
14
+ metadata: Optional[Dict[str, Any]] = None
15
+ ):
16
+ self.content = content
17
+ self.name = name
18
+ self.description = description or ""
19
+ self.tags = tags or []
20
+ self.metadata = metadata or {}
21
+ self.created_at = datetime.datetime.now().isoformat()
22
+ self.updated_at = self.created_at
23
+ self.id = self._generate_id()
24
+ self.version = 1
25
+
26
+ def _generate_id(self) -> str:
27
+ """Generate a unique ID based on content and name."""
28
+ unique_string = f"{self.name}:{self.content}:{self.created_at}"
29
+ return hashlib.md5(unique_string.encode()).hexdigest()[:10]
30
+
31
+ def update(self, content: Optional[str] = None, **kwargs) -> None:
32
+ """Update prompt attributes."""
33
+ if content is not None:
34
+ self.content = content
35
+
36
+ for key, value in kwargs.items():
37
+ if hasattr(self, key):
38
+ setattr(self, key, value)
39
+
40
+ self.updated_at = datetime.datetime.now().isoformat()
41
+
42
+ def to_dict(self) -> Dict[str, Any]:
43
+ """Convert prompt to dictionary."""
44
+ return {
45
+ "id": self.id,
46
+ "name": self.name,
47
+ "content": self.content,
48
+ "description": self.description,
49
+ "tags": self.tags,
50
+ "metadata": self.metadata,
51
+ "created_at": self.created_at,
52
+ "updated_at": self.updated_at,
53
+ "version": self.version
54
+ }
55
+
56
+ @classmethod
57
+ def from_dict(cls, data: Dict[str, Any]) -> "Prompt":
58
+ """Create prompt from dictionary."""
59
+ prompt = cls(
60
+ content=data["content"],
61
+ name=data["name"],
62
+ description=data.get("description", ""),
63
+ tags=data.get("tags", []),
64
+ metadata=data.get("metadata", {})
65
+ )
66
+ prompt.id = data["id"]
67
+ prompt.created_at = data["created_at"]
68
+ prompt.updated_at = data["updated_at"]
69
+ prompt.version = data["version"]
70
+ return prompt
71
+
72
+ def render(self, **kwargs) -> str:
73
+ """Render prompt with provided variables."""
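+ # Placeholders use single-brace syntax (e.g. "Hello {name}"); variables that
+ # are not supplied are left in the content unchanged.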
74
+ rendered = self.content
75
+ for key, value in kwargs.items():
76
+ placeholder = f"{{{key}}}"
77
+ rendered = rendered.replace(placeholder, str(value))
78
+ return rendered
79
+
80
+
81
+ class PromptManager:
82
+ def __init__(self, storage_path: Optional[str] = None):
83
+ self.storage_path = storage_path or os.path.join(os.getcwd(), "promptlab_storage")
84
+ self.prompts: Dict[str, Prompt] = {}
85
+ self._ensure_storage_dir()
86
+ self._load_prompts()
87
+
88
+ def _ensure_storage_dir(self) -> None:
89
+ """Ensure storage directory exists."""
90
+ os.makedirs(self.storage_path, exist_ok=True)
91
+
92
+ def _load_prompts(self) -> None:
93
+ """Load prompts from storage."""
94
+ prompts_dir = os.path.join(self.storage_path, "prompts")
95
+ if not os.path.exists(prompts_dir):
96
+ os.makedirs(prompts_dir)
97
+ return
98
+
99
+ for filename in os.listdir(prompts_dir):
100
+ if filename.endswith(".json"):
101
+ with open(os.path.join(prompts_dir, filename), "r") as f:
102
+ prompt_data = json.load(f)
103
+ prompt = Prompt.from_dict(prompt_data)
104
+ self.prompts[prompt.id] = prompt
105
+
106
+ def _save_prompt(self, prompt: Prompt) -> None:
107
+ """Save prompt to storage."""
108
+ prompts_dir = os.path.join(self.storage_path, "prompts")
109
+ os.makedirs(prompts_dir, exist_ok=True)
110
+
111
+ prompt_path = os.path.join(prompts_dir, f"{prompt.id}.json")
112
+ with open(prompt_path, "w") as f:
113
+ json.dump(prompt.to_dict(), f, indent=2)
114
+
115
+ def create(
116
+ self,
117
+ content: str,
118
+ name: str,
119
+ description: Optional[str] = None,
120
+ tags: Optional[List[str]] = None,
121
+ metadata: Optional[Dict[str, Any]] = None
122
+ ) -> Prompt:
123
+ """Create a new prompt."""
124
+ prompt = Prompt(
125
+ content=content,
126
+ name=name,
127
+ description=description,
128
+ tags=tags,
129
+ metadata=metadata
130
+ )
131
+ self.prompts[prompt.id] = prompt
132
+ self._save_prompt(prompt)
133
+ return prompt
134
+
135
+ def get(self, prompt_id: str) -> Optional[Prompt]:
136
+ """Get prompt by ID."""
137
+ return self.prompts.get(prompt_id)
138
+
139
+ def update(self, prompt_id: str, **kwargs) -> Optional[Prompt]:
140
+ """Update prompt by ID."""
141
+ prompt = self.get(prompt_id)
142
+ if prompt:
143
+ prompt.update(**kwargs)
144
+ self._save_prompt(prompt)
145
+ return prompt
146
+
147
+ def delete(self, prompt_id: str) -> bool:
148
+ """Delete prompt by ID."""
149
+ if prompt_id in self.prompts:
150
+ del self.prompts[prompt_id]
151
+ prompt_path = os.path.join(self.storage_path, "prompts", f"{prompt_id}.json")
152
+ if os.path.exists(prompt_path):
153
+ os.remove(prompt_path)
154
+ return True
155
+ return False
156
+
157
+ def list(self, tags: Optional[List[str]] = None) -> List[Prompt]:
158
+ """List prompts, optionally filtered by tags."""
159
+ if tags:
160
+ return [p for p in self.prompts.values() if any(tag in p.tags for tag in tags)]
161
+ return list(self.prompts.values())
162
+
163
+ def search(self, query: str) -> List[Prompt]:
164
+ """Search prompts by name or content."""
165
+ query = query.lower()
166
+ return [
167
+ p for p in self.prompts.values()
168
+ if query in p.name.lower() or query in p.content.lower()
169
+ ]
promptlab/core/testing.py ADDED
@@ -0,0 +1,451 @@
1
+ import os
2
+ import json
3
+ import uuid
4
+ import datetime
5
+ import asyncio
6
+ from typing import Dict, List, Optional, Any, Callable, Union, Awaitable, Tuple
7
+ from .prompt_manager import Prompt, PromptManager
8
+
9
+ class TestCase:
10
+ """Represents a test case for a prompt."""
11
+ def __init__(
12
+ self,
13
+ prompt_id: str,
14
+ input_vars: Dict[str, Any],
15
+ expected_output: Optional[str] = None,
16
+ name: Optional[str] = None,
17
+ description: Optional[str] = None
18
+ ):
19
+ self.id = str(uuid.uuid4())[:10]
20
+ self.prompt_id = prompt_id
21
+ self.input_vars = input_vars
22
+ self.expected_output = expected_output
23
+ self.name = name or f"Test case {self.id}"
24
+ self.description = description or ""
25
+ self.created_at = datetime.datetime.now().isoformat()
26
+
27
+ def to_dict(self) -> Dict[str, Any]:
28
+ """Convert test case to dictionary."""
29
+ return {
30
+ "id": self.id,
31
+ "prompt_id": self.prompt_id,
32
+ "input_vars": self.input_vars,
33
+ "expected_output": self.expected_output,
34
+ "name": self.name,
35
+ "description": self.description,
36
+ "created_at": self.created_at
37
+ }
38
+
39
+ @classmethod
40
+ def from_dict(cls, data: Dict[str, Any]) -> "TestCase":
41
+ """Create test case from dictionary."""
42
+ test_case = cls(
43
+ prompt_id=data["prompt_id"],
44
+ input_vars=data["input_vars"],
45
+ expected_output=data.get("expected_output"),
46
+ name=data.get("name"),
47
+ description=data.get("description")
48
+ )
49
+ test_case.id = data["id"]
50
+ test_case.created_at = data["created_at"]
51
+ return test_case
52
+
53
+
54
+ class TestResult:
55
+ """Represents the result of a test case execution."""
56
+ def __init__(
57
+ self,
58
+ test_case_id: str,
59
+ prompt_id: str,
60
+ prompt_version: int,
61
+ output: str,
62
+ passed: Optional[bool] = None,
63
+ metrics: Optional[Dict[str, float]] = None
64
+ ):
65
+ self.id = str(uuid.uuid4())[:10]
66
+ self.test_case_id = test_case_id
67
+ self.prompt_id = prompt_id
68
+ self.prompt_version = prompt_version
69
+ self.output = output
70
+ self.passed = passed
71
+ self.metrics = metrics or {}
72
+ self.created_at = datetime.datetime.now().isoformat()
73
+
74
+ def to_dict(self) -> Dict[str, Any]:
75
+ """Convert test result to dictionary."""
76
+ return {
77
+ "id": self.id,
78
+ "test_case_id": self.test_case_id,
79
+ "prompt_id": self.prompt_id,
80
+ "prompt_version": self.prompt_version,
81
+ "output": self.output,
82
+ "passed": self.passed,
83
+ "metrics": self.metrics,
84
+ "created_at": self.created_at
85
+ }
86
+
87
+ @classmethod
88
+ def from_dict(cls, data: Dict[str, Any]) -> "TestResult":
89
+ """Create test result from dictionary."""
90
+ result = cls(
+ test_case_id=data["test_case_id"],
+ prompt_id=data["prompt_id"],
+ prompt_version=data["prompt_version"],
+ output=data["output"],
+ passed=data.get("passed"),
+ metrics=data.get("metrics", {})
+ )
+ # Restore the persisted ID and timestamp so reloaded results keep their identity
+ result.id = data["id"]
+ result.created_at = data["created_at"]
+ return result
98
+
99
+
100
+ class ABTestResult:
101
+ """Represents the result of an A/B test."""
102
+ def __init__(
103
+ self,
104
+ prompt_a_id: str,
105
+ prompt_b_id: str,
106
+ prompt_a_version: int,
107
+ prompt_b_version: int,
108
+ metrics_a: Dict[str, float],
109
+ metrics_b: Dict[str, float],
110
+ winner: Optional[str] = None
111
+ ):
112
+ self.id = str(uuid.uuid4())[:10]
113
+ self.prompt_a_id = prompt_a_id
114
+ self.prompt_b_id = prompt_b_id
115
+ self.prompt_a_version = prompt_a_version
116
+ self.prompt_b_version = prompt_b_version
117
+ self.metrics_a = metrics_a
118
+ self.metrics_b = metrics_b
119
+ self.winner = winner
120
+ self.created_at = datetime.datetime.now().isoformat()
121
+
122
+ def to_dict(self) -> Dict[str, Any]:
123
+ """Convert A/B test result to dictionary."""
124
+ return {
125
+ "id": self.id,
126
+ "prompt_a_id": self.prompt_a_id,
127
+ "prompt_b_id": self.prompt_b_id,
128
+ "prompt_a_version": self.prompt_a_version,
129
+ "prompt_b_version": self.prompt_b_version,
130
+ "metrics_a": self.metrics_a,
131
+ "metrics_b": self.metrics_b,
132
+ "winner": self.winner,
133
+ "created_at": self.created_at
134
+ }
135
+
136
+ @classmethod
137
+ def from_dict(cls, data: Dict[str, Any]) -> "ABTestResult":
138
+ """Create A/B test result from dictionary."""
139
+ result = cls(
+ prompt_a_id=data["prompt_a_id"],
+ prompt_b_id=data["prompt_b_id"],
+ prompt_a_version=data["prompt_a_version"],
+ prompt_b_version=data["prompt_b_version"],
+ metrics_a=data["metrics_a"],
+ metrics_b=data["metrics_b"],
+ winner=data.get("winner")
+ )
+ # Restore the persisted ID and timestamp so reloaded results keep their identity
+ result.id = data["id"]
+ result.created_at = data["created_at"]
+ return result
148
+
149
+
150
+ class PromptTesting:
151
+ """Manages testing for prompts."""
152
+ def __init__(self, prompt_manager: PromptManager):
153
+ self.prompt_manager = prompt_manager
154
+ self.storage_path = os.path.join(prompt_manager.storage_path, "tests")
155
+ os.makedirs(self.storage_path, exist_ok=True)
156
+
157
+ # Storage paths
158
+ self.test_cases_path = os.path.join(self.storage_path, "test_cases")
159
+ self.test_results_path = os.path.join(self.storage_path, "test_results")
160
+ self.ab_test_results_path = os.path.join(self.storage_path, "ab_test_results")
161
+
162
+ os.makedirs(self.test_cases_path, exist_ok=True)
163
+ os.makedirs(self.test_results_path, exist_ok=True)
164
+ os.makedirs(self.ab_test_results_path, exist_ok=True)
165
+
166
+ self.test_cases: Dict[str, TestCase] = {}
167
+ self.test_results: Dict[str, TestResult] = {}
168
+ self.ab_test_results: Dict[str, ABTestResult] = {}
169
+
170
+ self._load_test_cases()
171
+ self._load_test_results()
172
+ self._load_ab_test_results()
173
+
174
+ def _load_test_cases(self) -> None:
175
+ """Load test cases from storage."""
176
+ for filename in os.listdir(self.test_cases_path):
177
+ if filename.endswith(".json"):
178
+ with open(os.path.join(self.test_cases_path, filename), "r") as f:
179
+ data = json.load(f)
180
+ test_case = TestCase.from_dict(data)
181
+ self.test_cases[test_case.id] = test_case
182
+
183
+ def _load_test_results(self) -> None:
184
+ """Load test results from storage."""
185
+ for filename in os.listdir(self.test_results_path):
186
+ if filename.endswith(".json"):
187
+ with open(os.path.join(self.test_results_path, filename), "r") as f:
188
+ data = json.load(f)
189
+ test_result = TestResult.from_dict(data)
190
+ self.test_results[test_result.id] = test_result
191
+
192
+ def _load_ab_test_results(self) -> None:
193
+ """Load A/B test results from storage."""
194
+ for filename in os.listdir(self.ab_test_results_path):
195
+ if filename.endswith(".json"):
196
+ with open(os.path.join(self.ab_test_results_path, filename), "r") as f:
197
+ data = json.load(f)
198
+ ab_test_result = ABTestResult.from_dict(data)
199
+ self.ab_test_results[ab_test_result.id] = ab_test_result
200
+
201
+ def _save_test_case(self, test_case: TestCase) -> None:
202
+ """Save test case to storage."""
203
+ file_path = os.path.join(self.test_cases_path, f"{test_case.id}.json")
204
+ with open(file_path, "w") as f:
205
+ json.dump(test_case.to_dict(), f, indent=2)
206
+
207
+ def _save_test_result(self, test_result: TestResult) -> None:
208
+ """Save test result to storage."""
209
+ file_path = os.path.join(self.test_results_path, f"{test_result.id}.json")
210
+ with open(file_path, "w") as f:
211
+ json.dump(test_result.to_dict(), f, indent=2)
212
+
213
+ def _save_ab_test_result(self, ab_test_result: ABTestResult) -> None:
214
+ """Save A/B test result to storage."""
215
+ file_path = os.path.join(self.ab_test_results_path, f"{ab_test_result.id}.json")
216
+ with open(file_path, "w") as f:
217
+ json.dump(ab_test_result.to_dict(), f, indent=2)
218
+
219
+ def create_test_case(
220
+ self,
221
+ prompt_id: str,
222
+ input_vars: Dict[str, Any],
223
+ expected_output: Optional[str] = None,
224
+ name: Optional[str] = None,
225
+ description: Optional[str] = None
226
+ ) -> TestCase:
227
+ """Create a test case for a prompt."""
228
+ test_case = TestCase(
229
+ prompt_id=prompt_id,
230
+ input_vars=input_vars,
231
+ expected_output=expected_output,
232
+ name=name,
233
+ description=description
234
+ )
235
+ self.test_cases[test_case.id] = test_case
236
+ self._save_test_case(test_case)
237
+ return test_case
238
+
239
+ def get_test_case(self, test_case_id: str) -> Optional[TestCase]:
240
+ """Get a test case by ID."""
241
+ return self.test_cases.get(test_case_id)
242
+
243
+ def list_test_cases(self, prompt_id: Optional[str] = None) -> List[TestCase]:
244
+ """List test cases, optionally filtered by prompt ID."""
245
+ if prompt_id:
246
+ return [tc for tc in self.test_cases.values() if tc.prompt_id == prompt_id]
247
+ return list(self.test_cases.values())
248
+
249
+ def delete_test_case(self, test_case_id: str) -> bool:
250
+ """Delete a test case by ID."""
251
+ if test_case_id in self.test_cases:
252
+ del self.test_cases[test_case_id]
253
+ file_path = os.path.join(self.test_cases_path, f"{test_case_id}.json")
254
+ if os.path.exists(file_path):
255
+ os.remove(file_path)
256
+ return True
257
+ return False
258
+
259
+ async def run_test_case(
260
+ self,
261
+ test_case_id: str,
262
+ llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
263
+ metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
264
+ ) -> TestResult:
265
+ """Run a test case with the given LLM callback."""
266
+ test_case = self.get_test_case(test_case_id)
267
+ if not test_case:
268
+ raise ValueError(f"Test case with ID {test_case_id} not found")
269
+
270
+ prompt = self.prompt_manager.get(test_case.prompt_id)
271
+ if not prompt:
272
+ raise ValueError(f"Prompt with ID {test_case.prompt_id} not found")
273
+
274
+ # Render the prompt with the input variables
275
+ rendered_prompt = prompt.render(**test_case.input_vars)
276
+
277
+ # Call the LLM with the rendered prompt
278
+ if asyncio.iscoroutinefunction(llm_callback):
279
+ output = await llm_callback(rendered_prompt, test_case.input_vars)
280
+ else:
281
+ output = llm_callback(rendered_prompt, test_case.input_vars)
282
+
283
+ # Determine if the test passed
284
+ passed = None
285
+ if test_case.expected_output:
286
+ passed = output.strip() == test_case.expected_output.strip()
287
+
288
+ # Calculate metrics if callbacks are provided
289
+ metrics = {}
290
+ if metrics_callbacks:
291
+ for metric_callback in metrics_callbacks:
292
+ metrics.update(metric_callback(output, test_case.expected_output or ""))
293
+
294
+ # Create and save the test result
295
+ test_result = TestResult(
296
+ test_case_id=test_case.id,
297
+ prompt_id=test_case.prompt_id,
298
+ prompt_version=prompt.version,
299
+ output=output,
300
+ passed=passed,
301
+ metrics=metrics
302
+ )
303
+ self.test_results[test_result.id] = test_result
304
+ self._save_test_result(test_result)
305
+
306
+ return test_result
307
+
308
+ async def run_test_cases(
309
+ self,
310
+ prompt_id: str,
311
+ llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
312
+ metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
313
+ ) -> List[TestResult]:
314
+ """Run all test cases for a prompt."""
315
+ test_cases = self.list_test_cases(prompt_id)
316
+ results = []
317
+
318
+ for test_case in test_cases:
319
+ result = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
320
+ results.append(result)
321
+
322
+ return results
323
+
324
+ async def run_ab_test(
325
+ self,
326
+ prompt_a_id: str,
327
+ prompt_b_id: str,
328
+ llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
329
+ metrics_callbacks: List[Callable[[str, str], Dict[str, float]]],
330
+ test_cases: Optional[List[str]] = None
331
+ ) -> ABTestResult:
332
+ """Run an A/B test with two prompts."""
333
+ prompt_a = self.prompt_manager.get(prompt_a_id)
334
+ prompt_b = self.prompt_manager.get(prompt_b_id)
335
+
336
+ if not prompt_a or not prompt_b:
337
+ raise ValueError("Both prompts must exist")
338
+
339
+ # Get test cases to use
340
+ if test_cases:
341
+ # Use specified test cases
342
+ test_case_objs = [self.get_test_case(tc_id) for tc_id in test_cases]
343
+ test_case_objs = [tc for tc in test_case_objs if tc]
344
+ else:
345
+ # Use all test cases for prompt A
346
+ test_case_objs = self.list_test_cases(prompt_a_id)
347
+
348
+ if not test_case_objs:
349
+ raise ValueError("No test cases found for the A/B test")
350
+
351
+ # Run test cases for both prompts
352
+ results_a = []
353
+ results_b = []
354
+
355
+ for test_case in test_case_objs:
356
+ # Create a copy of the test case for prompt B
357
+ if test_case.prompt_id != prompt_b_id:
358
+ test_case_b = self.create_test_case(
359
+ prompt_id=prompt_b_id,
360
+ input_vars=test_case.input_vars,
361
+ expected_output=test_case.expected_output,
362
+ name=f"Copy of {test_case.name} for B",
363
+ description=test_case.description
364
+ )
365
+ else:
366
+ test_case_b = test_case
367
+
368
+ # Run the test cases
369
+ result_a = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
370
+ result_b = await self.run_test_case(test_case_b.id, llm_callback, metrics_callbacks)
371
+
372
+ results_a.append(result_a)
373
+ results_b.append(result_b)
374
+
375
+ # Calculate aggregate metrics
376
+ metrics_a = self._aggregate_metrics([r.metrics for r in results_a])
377
+ metrics_b = self._aggregate_metrics([r.metrics for r in results_b])
378
+
379
+ # Determine winner
380
+ winner = self._determine_winner(metrics_a, metrics_b)
381
+
382
+ # Create and save the A/B test result
383
+ ab_test_result = ABTestResult(
384
+ prompt_a_id=prompt_a_id,
385
+ prompt_b_id=prompt_b_id,
386
+ prompt_a_version=prompt_a.version,
387
+ prompt_b_version=prompt_b.version,
388
+ metrics_a=metrics_a,
389
+ metrics_b=metrics_b,
390
+ winner=winner
391
+ )
392
+ self.ab_test_results[ab_test_result.id] = ab_test_result
393
+ self._save_ab_test_result(ab_test_result)
394
+
395
+ return ab_test_result
396
+
397
+ def _aggregate_metrics(self, metrics_list: List[Dict[str, float]]) -> Dict[str, float]:
398
+ """Aggregate metrics from multiple test results."""
399
+ if not metrics_list:
400
+ return {}
401
+
402
+ aggregated = {}
403
+ for key in metrics_list[0].keys():
404
+ values = [m.get(key, 0) for m in metrics_list]
405
+ aggregated[key] = sum(values) / len(values) # Simple average
406
+
407
+ return aggregated
408
+
409
+ def _determine_winner(self, metrics_a: Dict[str, float], metrics_b: Dict[str, float]) -> Optional[str]:
410
+ """Determine winner of A/B test based on metrics."""
411
+ if not metrics_a or not metrics_b:
412
+ return None
413
+
414
+ # Assume higher values are better for all metrics
415
+ a_wins = 0
416
+ b_wins = 0
417
+
418
+ for key in metrics_a.keys():
419
+ if key in metrics_b:
420
+ if metrics_a[key] > metrics_b[key]:
421
+ a_wins += 1
422
+ elif metrics_b[key] > metrics_a[key]:
423
+ b_wins += 1
424
+
425
+ if a_wins > b_wins:
426
+ return "A"
427
+ elif b_wins > a_wins:
428
+ return "B"
429
+ else:
430
+ return None # Tie
431
+
432
+ def get_test_results(self, test_case_id: Optional[str] = None, prompt_id: Optional[str] = None) -> List[TestResult]:
433
+ """Get test results, optionally filtered by test case ID or prompt ID."""
434
+ results = list(self.test_results.values())
435
+
436
+ if test_case_id:
437
+ results = [r for r in results if r.test_case_id == test_case_id]
438
+
439
+ if prompt_id:
440
+ results = [r for r in results if r.prompt_id == prompt_id]
441
+
442
+ return sorted(results, key=lambda r: r.created_at, reverse=True)
443
+
444
+ def get_ab_test_results(self, prompt_id: Optional[str] = None) -> List[ABTestResult]:
445
+ """Get A/B test results, optionally filtered by prompt ID."""
446
+ results = list(self.ab_test_results.values())
447
+
448
+ if prompt_id:
449
+ results = [r for r in results if r.prompt_a_id == prompt_id or r.prompt_b_id == prompt_id]
450
+
451
+ return sorted(results, key=lambda r: r.created_at, reverse=True)
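+
+ # Illustrative A/B test sketch; the prompt IDs, callback, and metric below are
+ # placeholders, not values defined by this module:
+ #
+ # result = asyncio.run(testing.run_ab_test(
+ # prompt_a_id="id_a",
+ # prompt_b_id="id_b",
+ # llm_callback=llm,
+ # metrics_callbacks=[lambda out, exp: {"length": float(len(out))}]
+ # ))
+ # print(result.winner or "Tie")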
promptlab/core/version_control.py ADDED
@@ -0,0 +1,161 @@
1
+ import os
2
+ import json
3
+ import datetime
4
+ from typing import Dict, List, Optional, Any
5
+ from .prompt_manager import Prompt, PromptManager
6
+
7
+ class PromptVersion:
8
+ """Represents a specific version of a prompt."""
9
+ def __init__(
10
+ self,
11
+ prompt_id: str,
12
+ version: int,
13
+ content: str,
14
+ metadata: Optional[Dict[str, Any]] = None,
15
+ commit_message: Optional[str] = None
16
+ ):
17
+ self.prompt_id = prompt_id
18
+ self.version = version
19
+ self.content = content
20
+ self.metadata = metadata or {}
21
+ self.commit_message = commit_message or ""
22
+ self.created_at = datetime.datetime.now().isoformat()
23
+
24
+ def to_dict(self) -> Dict[str, Any]:
25
+ """Convert version to dictionary."""
26
+ return {
27
+ "prompt_id": self.prompt_id,
28
+ "version": self.version,
29
+ "content": self.content,
30
+ "metadata": self.metadata,
31
+ "commit_message": self.commit_message,
32
+ "created_at": self.created_at
33
+ }
34
+
35
+ @classmethod
36
+ def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
37
+ """Create version from dictionary."""
38
+ version = cls(
+ prompt_id=data["prompt_id"],
+ version=data["version"],
+ content=data["content"],
+ metadata=data.get("metadata", {}),
+ commit_message=data.get("commit_message", "")
+ )
+ # Preserve the stored creation timestamp instead of regenerating it on load
+ version.created_at = data.get("created_at", version.created_at)
+ return version
45
+
46
+
47
+ class VersionControl:
48
+ """Manages versioning for prompts."""
49
+ def __init__(self, prompt_manager: PromptManager):
50
+ self.prompt_manager = prompt_manager
51
+ self.storage_path = os.path.join(prompt_manager.storage_path, "versions")
52
+ os.makedirs(self.storage_path, exist_ok=True)
53
+ self.versions: Dict[str, Dict[int, PromptVersion]] = {}
54
+ self._load_versions()
55
+
56
+ def _load_versions(self) -> None:
57
+ """Load versions from storage."""
58
+ if not os.path.exists(self.storage_path):
59
+ os.makedirs(self.storage_path)
60
+ return
61
+
62
+ for prompt_id_dir in os.listdir(self.storage_path):
63
+ prompt_dir = os.path.join(self.storage_path, prompt_id_dir)
64
+ if os.path.isdir(prompt_dir):
65
+ self.versions[prompt_id_dir] = {}
66
+
67
+ for filename in os.listdir(prompt_dir):
68
+ if filename.endswith(".json"):
69
+ with open(os.path.join(prompt_dir, filename), "r") as f:
70
+ version_data = json.load(f)
71
+ version = PromptVersion.from_dict(version_data)
72
+ self.versions[prompt_id_dir][version.version] = version
73
+
74
+ def _save_version(self, version: PromptVersion) -> None:
75
+ """Save version to storage."""
76
+ prompt_dir = os.path.join(self.storage_path, version.prompt_id)
77
+ os.makedirs(prompt_dir, exist_ok=True)
78
+
79
+ version_path = os.path.join(prompt_dir, f"v{version.version}.json")
80
+ with open(version_path, "w") as f:
81
+ json.dump(version.to_dict(), f, indent=2)
82
+
83
+ def commit(
84
+ self,
85
+ prompt_id: str,
86
+ commit_message: Optional[str] = None,
87
+ metadata: Optional[Dict[str, Any]] = None
88
+ ) -> Optional[PromptVersion]:
89
+ """Create a new version of a prompt."""
90
+ prompt = self.prompt_manager.get(prompt_id)
91
+ if not prompt:
92
+ return None
93
+
94
+ # Initialize versions dict for this prompt if it doesn't exist
95
+ if prompt_id not in self.versions:
96
+ self.versions[prompt_id] = {}
97
+
98
+ # Get the highest version number for this prompt
99
+ current_versions = self.versions.get(prompt_id, {})
100
+ next_version = max(current_versions.keys(), default=0) + 1
101
+
102
+ # Create the new version
103
+ version = PromptVersion(
104
+ prompt_id=prompt_id,
105
+ version=next_version,
106
+ content=prompt.content,
107
+ metadata=metadata or {},
108
+ commit_message=commit_message
109
+ )
110
+
111
+ # Save the new version
112
+ self.versions[prompt_id][next_version] = version
113
+ self._save_version(version)
114
+
115
+ # Update the prompt's version number
116
+ prompt.version = next_version
117
+ self.prompt_manager._save_prompt(prompt)
118
+
119
+ return version
120
+
121
+ def get_version(self, prompt_id: str, version: int) -> Optional[PromptVersion]:
122
+ """Get a specific version of a prompt."""
123
+ return self.versions.get(prompt_id, {}).get(version)
124
+
125
+ def list_versions(self, prompt_id: str) -> List[PromptVersion]:
126
+ """List all versions of a prompt."""
127
+ versions = self.versions.get(prompt_id, {})
128
+ return sorted(versions.values(), key=lambda v: v.version)
129
+
130
+ def checkout(self, prompt_id: str, version: int) -> Optional[Prompt]:
131
+ """Checkout a specific version of a prompt."""
132
+ prompt = self.prompt_manager.get(prompt_id)
133
+ version_obj = self.get_version(prompt_id, version)
134
+
135
+ if not prompt or not version_obj:
136
+ return None
137
+
138
+ prompt.content = version_obj.content
139
+ prompt.version = version
140
+ prompt.updated_at = datetime.datetime.now().isoformat()
141
+
142
+ self.prompt_manager._save_prompt(prompt)
143
+ return prompt
144
+
145
+ def diff(self, prompt_id: str, version1: int, version2: int) -> Dict[str, Any]:
146
+ """Compare two versions of a prompt."""
147
+ v1 = self.get_version(prompt_id, version1)
148
+ v2 = self.get_version(prompt_id, version2)
149
+
150
+ if not v1 or not v2:
151
+ return {}
152
+
153
+ import difflib
154
+ d = difflib.Differ()
155
+ diff = list(d.compare(v1.content.splitlines(), v2.content.splitlines()))
156
+
157
+ return {
158
+ "version1": version1,
159
+ "version2": version2,
160
+ "diff": diff
161
+ }
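checkout() rewrites the live prompt in place, while diff() returns raw difflib.Differ lines rather than a rendered patch. A small sketch of printing only the changed lines between two committed versions, assuming a version_control set up as in examples/basic_usage.py below; my_prompt is a placeholder for a prompt with at least two commits:

result = version_control.diff(prompt_id=my_prompt.id, version1=1, version2=2)

for line in result.get("diff", []):
    # difflib.Differ prefixes: "- " removed, "+ " added, "  " unchanged, "? " hint lines
    if line.startswith(("- ", "+ ")):
        print(line)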
promptlab/examples/__init__.py ADDED
File without changes
promptlab/examples/ab_testing.py ADDED
@@ -0,0 +1,117 @@
+ """
+ A/B testing example for PromptLab.
+
+ This example demonstrates how to use PromptLab to perform A/B testing
+ on different prompt variations to find the most effective one.
+ """
+
+ import asyncio
+ import os
+ from promptlab import PromptManager, PromptTesting
+
+ async def llm_callback(prompt, vars):
+     """
+     Simulated LLM callback for testing.
+
+     In a real scenario, this would call an actual LLM API.
+     """
+     # Simple simulation - return different responses based on prompt content
+     if "concise" in prompt.lower():
+         return "This is a short, concise response."
+     elif "detailed" in prompt.lower():
+         return "This is a much more detailed response that provides additional context and information about the query. It elaborates on various aspects and provides a comprehensive answer."
+     else:
+         return "Default response."
+
+ async def main():
+     # Initialize the prompt manager with a custom storage path
+     storage_path = os.path.join(os.getcwd(), "promptlab_storage")
+     prompt_manager = PromptManager(storage_path)
+
+     # Initialize testing
+     testing = PromptTesting(prompt_manager)
+
+     # Create two prompt variations
+     prompt_a = prompt_manager.create(
+         content="Provide a concise answer to the following question: {question}",
+         name="Concise Prompt",
+         description="A prompt that asks for concise answers",
+         tags=["concise", "test"]
+     )
+
+     prompt_b = prompt_manager.create(
+         content="Provide a detailed and comprehensive answer to the following question: {question}",
+         name="Detailed Prompt",
+         description="A prompt that asks for detailed answers",
+         tags=["detailed", "test"]
+     )
+
+     print(f"Created prompt A with ID: {prompt_a.id}")
+     print(f"Created prompt B with ID: {prompt_b.id}")
+
+     # Create test cases
+     test_cases = []
+
+     questions = [
+         "What is machine learning?",
+         "How does a neural network work?",
+         "What are the benefits of version control?"
+     ]
+
+     for i, question in enumerate(questions):
+         test_case = testing.create_test_case(
+             prompt_id=prompt_a.id,
+             input_vars={"question": question},
+             name=f"Test Case {i+1}",
+             description=f"Test case for question: {question}"
+         )
+         test_cases.append(test_case.id)
+
+     print(f"Created {len(test_cases)} test cases")
+
+     # Define metrics callbacks
+     def length_metric(output, expected):
+         """Measure output length as a metric."""
+         return {"length": len(output) / 1000}  # Normalize to 0-1 range
+
+     def keyword_metric(output, expected):
+         """Check for presence of keywords."""
+         keywords = ["machine", "learning", "neural", "network", "version", "control"]
+         matches = sum(1 for k in keywords if k.lower() in output.lower())
+         return {"keyword_matches": matches / len(keywords)}
+
+     # Run A/B test
+     ab_result = await testing.run_ab_test(
+         prompt_a_id=prompt_a.id,
+         prompt_b_id=prompt_b.id,
+         llm_callback=llm_callback,
+         metrics_callbacks=[length_metric, keyword_metric],
+         test_cases=test_cases
+     )
+
+     print(f"A/B test completed with ID: {ab_result.id}")
+     print(f"Prompt A metrics: {ab_result.metrics_a}")
+     print(f"Prompt B metrics: {ab_result.metrics_b}")
+     print(f"Winner: {ab_result.winner or 'Tie'}")
+
+     # List all test results
+     results_a = testing.get_test_results(prompt_id=prompt_a.id)
+     results_b = testing.get_test_results(prompt_id=prompt_b.id)
+
+     print(f"Found {len(results_a)} test results for prompt A")
+     print(f"Found {len(results_b)} test results for prompt B")
+
+     # Display individual test results
+     print("\nSample outputs:")
+
+     for i, (result_a, result_b) in enumerate(zip(results_a[:3], results_b[:3])):
+         print(f"\nTest Case {i+1}:")
+
+         print("\nConcise prompt output:")
+         print(result_a.output)
+
+         print("\nDetailed prompt output:")
+         print(result_b.output)
+
+ if __name__ == "__main__":
+     asyncio.run(main())
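The llm_callback above is a stub; in a real run it would wrap an actual model call while keeping the same (prompt, vars) signature. A minimal sketch, assuming the OpenAI Python SDK (openai>=1.0) is installed, OPENAI_API_KEY is set in the environment, and the model name is one you can access; none of this is part of PromptLab itself:

from openai import AsyncOpenAI

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def llm_callback(prompt, vars):
    """Send the rendered prompt to a hosted chat model and return its text."""
    response = await client.chat.completions.create(
        model="gpt-4o-mini",  # assumption: swap in whatever model you use
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content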
promptlab/examples/basic_usage.py ADDED
@@ -0,0 +1,109 @@
+ """
+ Basic usage example for PromptLab.
+
+ This example demonstrates the fundamental features of PromptLab
+ including creating prompts, versioning, and rendering.
+ """
+
+ import asyncio
+ import os
+ from promptlab import PromptManager, VersionControl
+
+ async def main():
+     # Initialize the prompt manager with a custom storage path
+     storage_path = os.path.join(os.getcwd(), "promptlab_storage")
+     prompt_manager = PromptManager(storage_path)
+
+     # Initialize version control
+     version_control = VersionControl(prompt_manager)
+
+     # Create a basic prompt
+     basic_prompt = prompt_manager.create(
+         content="Hello, my name is {name} and I am a {occupation}.",
+         name="Introduction",
+         description="A simple introduction prompt",
+         tags=["basic", "introduction"]
+     )
+
+     print(f"Created prompt with ID: {basic_prompt.id}")
+
+     # Render the prompt with variables
+     rendered = basic_prompt.render(name="Alice", occupation="Data Scientist")
+     print(f"Rendered prompt: {rendered}")
+
+     # Create a more complex prompt
+     complex_prompt = prompt_manager.create(
+         content="""
+ System: {system_message}
+
+ User: {user_message}
+
+ Assistant:
+ """,
+         name="Chat Interaction",
+         description="A prompt for chat interactions",
+         tags=["chat", "interaction"]
+     )
+
+     print(f"Created complex prompt with ID: {complex_prompt.id}")
+
+     # Render the complex prompt
+     rendered = complex_prompt.render(
+         system_message="You are a helpful assistant.",
+         user_message="Can you help me with Python programming?"
+     )
+     print(f"Rendered complex prompt:\n{rendered}")
+
+     # Create a version
+     version = version_control.commit(
+         prompt_id=complex_prompt.id,
+         commit_message="Initial version"
+     )
+
+     print(f"Created version {version.version} for prompt {complex_prompt.id}")
+
+     # Update the prompt
+     complex_prompt = prompt_manager.update(
+         complex_prompt.id,
+         content="""
+ System: {system_message}
+
+ User: {user_message}
+
+ Think step by step:
+ {thinking}
+
+ Assistant:
+ """
+     )
+
+     print(f"Updated prompt with ID: {complex_prompt.id}")
+
+     # Create another version
+     version = version_control.commit(
+         prompt_id=complex_prompt.id,
+         commit_message="Added thinking step"
+     )
+
+     print(f"Created version {version.version} for prompt {complex_prompt.id}")
+
+     # List all versions
+     versions = version_control.list_versions(complex_prompt.id)
+     print(f"Found {len(versions)} versions for prompt {complex_prompt.id}:")
+     for v in versions:
+         print(f"Version: {v.version} | Created: {v.created_at} | Message: {v.commit_message}")
+
+     # Checkout a specific version
+     prompt = version_control.checkout(complex_prompt.id, 1)
+     print(f"Checked out version 1 for prompt {complex_prompt.id}")
+     print(f"Content:\n{prompt.content}")
+
+     # List all prompts
+     prompts = prompt_manager.list()
+     print(f"Found {len(prompts)} prompts:")
+     for p in prompts:
+         print(f"ID: {p.id} | Name: {p.name} | Tags: {', '.join(p.tags)}")
+
+ if __name__ == "__main__":
+     asyncio.run(main())
promptlab/examples/evaluation_example.py ADDED
@@ -0,0 +1,95 @@
+ """
+ Evaluation example for PromptLab.
+
+ This example demonstrates how to use PromptLab's evaluation framework
+ to measure the quality of prompts using various metrics.
+ """
+
+ import asyncio
+ import os
+ from promptlab import PromptManager, Evaluator, ContainsKeywordsMetric, LengthMetric
+
+ async def llm_callback(prompt, vars):
+     """
+     Simulated LLM callback for testing.
+
+     In a real scenario, this would call an actual LLM API.
+     """
+     # Simple simulation based on input text
+     text = vars.get("text", "")
+
+     if "code" in text.lower():
+         return "```python\ndef hello_world():\n    print('Hello, world!')\n```"
+     elif "list" in text.lower():
+         return "1. First item\n2. Second item\n3. Third item"
+     elif "summary" in text.lower():
+         return f"This is a summary of the text about {text.split()[0]}."
+     else:
+         return f"Response to: {text}"
+
+ async def main():
+     # Initialize the prompt manager with a custom storage path
+     storage_path = os.path.join(os.getcwd(), "promptlab_storage")
+     prompt_manager = PromptManager(storage_path)
+
+     # Initialize evaluator
+     evaluator = Evaluator(prompt_manager)
+
+     # Create a prompt for evaluation
+     prompt = prompt_manager.create(
+         content="Please {action} the following text: {text}",
+         name="Dynamic Action Prompt",
+         description="A prompt that can perform different actions based on input",
+         tags=["action", "dynamic"]
+     )
+
+     print(f"Created prompt with ID: {prompt.id}")
+
+     # Register custom metrics
+     code_keywords = ContainsKeywordsMetric(
+         keywords=["def", "print", "function", "return"],
+         case_sensitive=False
+     )
+     evaluator.register_metric(code_keywords)
+
+     list_keywords = ContainsKeywordsMetric(
+         keywords=["1.", "2.", "3.", "item"],
+         case_sensitive=False
+     )
+     evaluator.register_metric(list_keywords)
+
+     length_metric = LengthMetric(min_length=10, max_length=500)
+     evaluator.register_metric(length_metric)
+
+     # Create test inputs for different actions
+     test_inputs = [
+         {"action": "write code for", "text": "a simple hello world function"},
+         {"action": "create a list of", "text": "three important items"},
+         {"action": "summarize", "text": "machine learning concepts in data science"},
+         {"action": "analyze", "text": "the impact of climate change on ecosystems"}
+     ]
+
+     # Run evaluation
+     evaluation_result = await evaluator.evaluate_prompt(
+         prompt_id=prompt.id,
+         inputs=test_inputs,
+         llm_callback=llm_callback
+     )
+
+     # Print evaluation results
+     print("\nEvaluation completed!")
+     print("\nAggregated metrics:")
+     for name, value in evaluation_result["aggregated_metrics"].items():
+         print(f"{name}: {value:.4f}")
+
+     print("\nIndividual results:")
+     for i, result in enumerate(evaluation_result["individual_results"]):
+         print(f"\nTest {i+1} ({result['input']['action']} {result['input']['text']}):")
+         print(f"Output: {result['output']}")
+
+         print("Metrics:")
+         for name, value in result["metrics"].items():
+             print(f"  {name}: {value:.4f}")
+
+ if __name__ == "__main__":
+     asyncio.run(main())
promptlab/tests/__init__.py ADDED
File without changes
promptlab/tests/test_evaluation.py ADDED
File without changes
promptlab/tests/test_prompt_manager.py ADDED
@@ -0,0 +1,115 @@
+ import unittest
+ import os
+ import shutil
+ import tempfile
+ from promptlab.core.prompt_manager import PromptManager, Prompt
+
+ class TestPromptManager(unittest.TestCase):
+     def setUp(self):
+         """Set up test environment."""
+         self.test_dir = tempfile.mkdtemp()
+         self.prompt_manager = PromptManager(self.test_dir)
+
+     def tearDown(self):
+         """Clean up test environment."""
+         shutil.rmtree(self.test_dir)
+
+     def test_create_prompt(self):
+         """Test creating a prompt."""
+         prompt = self.prompt_manager.create(
+             content="Test prompt {var}",
+             name="Test Prompt",
+             description="A test prompt",
+             tags=["test", "example"]
+         )
+
+         self.assertIsNotNone(prompt)
+         self.assertEqual(prompt.name, "Test Prompt")
+         self.assertEqual(prompt.content, "Test prompt {var}")
+         self.assertEqual(prompt.description, "A test prompt")
+         self.assertEqual(prompt.tags, ["test", "example"])
+
+     def test_get_prompt(self):
+         """Test getting a prompt."""
+         prompt = self.prompt_manager.create(
+             content="Test prompt",
+             name="Test Prompt"
+         )
+
+         retrieved = self.prompt_manager.get(prompt.id)
+
+         self.assertIsNotNone(retrieved)
+         self.assertEqual(retrieved.id, prompt.id)
+         self.assertEqual(retrieved.name, prompt.name)
+         self.assertEqual(retrieved.content, prompt.content)
+
+     def test_update_prompt(self):
+         """Test updating a prompt."""
+         prompt = self.prompt_manager.create(
+             content="Test prompt",
+             name="Test Prompt"
+         )
+
+         updated = self.prompt_manager.update(
+             prompt.id,
+             content="Updated prompt",
+             name="Updated Name"
+         )
+
+         self.assertEqual(updated.content, "Updated prompt")
+         self.assertEqual(updated.name, "Updated Name")
+
+         # Check that the update was persisted
+         retrieved = self.prompt_manager.get(prompt.id)
+         self.assertEqual(retrieved.content, "Updated prompt")
+         self.assertEqual(retrieved.name, "Updated Name")
+
+     def test_delete_prompt(self):
+         """Test deleting a prompt."""
+         prompt = self.prompt_manager.create(
+             content="Test prompt",
+             name="Test Prompt"
+         )
+
+         success = self.prompt_manager.delete(prompt.id)
+
+         self.assertTrue(success)
+         self.assertIsNone(self.prompt_manager.get(prompt.id))
+
+     def test_list_prompts(self):
+         """Test listing prompts."""
+         self.prompt_manager.create(
+             content="Test prompt 1",
+             name="Test Prompt 1",
+             tags=["test", "one"]
+         )
+
+         self.prompt_manager.create(
+             content="Test prompt 2",
+             name="Test Prompt 2",
+             tags=["test", "two"]
+         )
+
+         all_prompts = self.prompt_manager.list()
+         self.assertEqual(len(all_prompts), 2)
+
+         test_tag_prompts = self.prompt_manager.list(tags=["test"])
+         self.assertEqual(len(test_tag_prompts), 2)
+
+         one_tag_prompts = self.prompt_manager.list(tags=["one"])
+         self.assertEqual(len(one_tag_prompts), 1)
+         self.assertEqual(one_tag_prompts[0].name, "Test Prompt 1")
+
+     def test_render_prompt(self):
+         """Test rendering a prompt with variables."""
+         prompt = self.prompt_manager.create(
+             content="Hello, {name}! You are a {occupation}.",
+             name="Test Prompt"
+         )
+
+         rendered = prompt.render(name="Alice", occupation="Data Scientist")
+
+         self.assertEqual(rendered, "Hello, Alice! You are a Data Scientist.")
+
+ if __name__ == "__main__":
+     unittest.main()
promptlab/tests/test_testing.py ADDED
File without changes
promptlab/tests/test_version_control.py ADDED
File without changes
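Several test modules in this commit are committed as empty placeholders. A minimal starting point for test_version_control.py, assuming only the VersionControl and PromptManager behavior shown above; the prompt contents are arbitrary:

import unittest
import shutil
import tempfile
from promptlab.core.prompt_manager import PromptManager
from promptlab.core.version_control import VersionControl

class TestVersionControl(unittest.TestCase):
    def setUp(self):
        self.test_dir = tempfile.mkdtemp()
        self.prompt_manager = PromptManager(self.test_dir)
        self.version_control = VersionControl(self.prompt_manager)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_commit_and_checkout(self):
        prompt = self.prompt_manager.create(content="v1 content", name="VC Test")

        first = self.version_control.commit(prompt.id, commit_message="first")
        self.prompt_manager.update(prompt.id, content="v2 content")
        second = self.version_control.commit(prompt.id, commit_message="second")

        versions = [v.version for v in self.version_control.list_versions(prompt.id)]
        self.assertEqual(versions, [first.version, second.version])

        restored = self.version_control.checkout(prompt.id, first.version)
        self.assertEqual(restored.content, "v1 content")

if __name__ == "__main__":
    unittest.main()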
promptlab/utils/__init__.py ADDED
File without changes
promptlab/utils/metrics.py ADDED
@@ -0,0 +1,161 @@
+ from typing import Dict, List, Optional, Any, Union, Callable
+ import re
+ import numpy as np
+ from difflib import SequenceMatcher
+
+ def exact_match(generated: str, expected: str) -> float:
+     """Calculate exact match score (1.0 if exact match, 0.0 otherwise)."""
+     if not expected or not generated:
+         return 0.0
+     return 1.0 if generated.strip() == expected.strip() else 0.0
+
+ def contains_all(generated: str, items: List[str], case_sensitive: bool = False) -> float:
+     """Check if generated text contains all items in the list."""
+     if not items:
+         return 0.0
+
+     if not case_sensitive:
+         generated = generated.lower()
+         items = [item.lower() for item in items]
+
+     matches = sum(1 for item in items if item in generated)
+     return matches / len(items)
+
+ def similarity_score(str1: str, str2: str) -> float:
+     """Calculate string similarity using difflib."""
+     if not str1 or not str2:
+         return 0.0
+     return SequenceMatcher(None, str1, str2).ratio()
+
+ def word_count(text: str) -> int:
+     """Count words in text."""
+     return len(re.findall(r'\w+', text))
+
+ def length_ratio(generated: str, expected: str) -> float:
+     """Calculate ratio of generated text length to expected text length."""
+     if not expected:
+         return 0.0
+
+     gen_length = len(generated)
+     exp_length = len(expected)
+
+     # Avoid division by zero
+     if exp_length == 0:
+         return 0.0 if gen_length > 0 else 1.0
+
+     # Return value between 0 and 1, with 1 being perfect match
+     # and decreasing as the ratio diverges from 1
+     ratio = gen_length / exp_length
+     return min(ratio, 1/ratio) if ratio > 0 else 0.0
+
+ def word_overlap(generated: str, expected: str) -> float:
+     """Calculate the word overlap between generated and expected text."""
+     if not expected or not generated:
+         return 0.0
+
+     gen_words = set(re.findall(r'\w+', generated.lower()))
+     exp_words = set(re.findall(r'\w+', expected.lower()))
+
+     if not exp_words:
+         return 0.0
+
+     intersection = gen_words.intersection(exp_words)
+     return len(intersection) / len(exp_words)
+
+ def keyword_presence(text: str, keywords: List[str], weight: Optional[Dict[str, float]] = None) -> Dict[str, float]:
+     """Check for presence of keywords with optional weights."""
+     if not keywords:
+         return {"keyword_score": 0.0}
+
+     text = text.lower()
+     result = {}
+
+     total_weight = 0
+     weighted_score = 0
+
+     for keyword in keywords:
+         keyword_lower = keyword.lower()
+         presence = 1.0 if keyword_lower in text else 0.0
+
+         # Apply weight if provided
+         kw_weight = weight.get(keyword, 1.0) if weight else 1.0
+         total_weight += kw_weight
+         weighted_score += presence * kw_weight
+
+         result[f"keyword_{keyword}"] = presence
+
+     # Calculate overall weighted score
+     if total_weight > 0:
+         result["keyword_score"] = weighted_score / total_weight
+     else:
+         result["keyword_score"] = 0.0
+
+     return result
+
+ class MetricsSet:
+     """A collection of evaluation metrics functions."""
+     def __init__(self):
+         self.metrics = {}
+
+     def add_metric(self, name: str, func: Callable, description: Optional[str] = None) -> None:
+         """Add a metric function to the set."""
+         self.metrics[name] = {
+             "function": func,
+             "description": description or ""
+         }
+
+     def evaluate(self, generated: str, expected: Optional[str] = None, **kwargs) -> Dict[str, float]:
+         """Evaluate all metrics on the given text."""
+         results = {}
+
+         for name, metric in self.metrics.items():
+             try:
+                 # Different metrics may require different arguments
+                 if expected is not None:
+                     if "keywords" in kwargs and name == "keyword_presence":
+                         result = metric["function"](generated, kwargs["keywords"])
+                     else:
+                         result = metric["function"](generated, expected)
+                 else:
+                     result = metric["function"](generated)
+
+                 # Handle both single values and dictionaries
+                 if isinstance(result, dict):
+                     results.update(result)
+                 else:
+                     results[name] = result
+             except Exception as e:
+                 results[name] = 0.0
+                 print(f"Error calculating metric {name}: {e}")
+
+         return results
+
+ def create_default_metrics_set() -> MetricsSet:
+     """Create a MetricsSet with default metrics."""
+     metrics = MetricsSet()
+
+     metrics.add_metric(
+         "exact_match",
+         exact_match,
+         "Exact string match between expected and generated"
+     )
+
+     metrics.add_metric(
+         "similarity",
+         similarity_score,
+         "String similarity using difflib's SequenceMatcher"
+     )
+
+     metrics.add_metric(
+         "word_overlap",
+         word_overlap,
+         "Ratio of words in expected that appear in generated"
+     )
+
+     metrics.add_metric(
+         "length_ratio",
+         length_ratio,
+         "Ratio of generated text length to expected text length"
+     )
+
+     return metrics
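These helpers are usable on their own, outside the Evaluator. A short sketch of extending the default set with the weighted keyword check; the example strings are invented:

from promptlab.utils.metrics import create_default_metrics_set, keyword_presence

metrics = create_default_metrics_set()
metrics.add_metric(
    "keyword_presence",
    keyword_presence,
    "Per-keyword hits plus a weighted overall keyword_score"
)

scores = metrics.evaluate(
    generated="Gradient descent updates the weights iteratively.",
    expected="Gradient descent iteratively updates model weights.",
    keywords=["gradient", "weights", "loss"],
)
print(scores)  # exact_match, similarity, word_overlap, length_ratio, keyword_* entries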
promptlab/utils/storage.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ import json
+ import shutil
+ from typing import Dict, Any, Optional, List
+
+ class Storage:
+     """Handles persistent storage for PromptLab."""
+     def __init__(self, base_path: str):
+         self.base_path = base_path
+         os.makedirs(base_path, exist_ok=True)
+
+     def ensure_dir(self, dir_path: str) -> str:
+         """Ensure directory exists and return its path."""
+         full_path = os.path.join(self.base_path, dir_path)
+         os.makedirs(full_path, exist_ok=True)
+         return full_path
+
+     def save_json(self, dir_path: str, filename: str, data: Dict[str, Any]) -> str:
+         """Save data to a JSON file."""
+         dir_full_path = self.ensure_dir(dir_path)
+         file_path = os.path.join(dir_full_path, f"{filename}.json")
+
+         with open(file_path, "w") as f:
+             json.dump(data, f, indent=2)
+
+         return file_path
+
+     def load_json(self, dir_path: str, filename: str) -> Optional[Dict[str, Any]]:
+         """Load data from a JSON file."""
+         file_path = os.path.join(self.base_path, dir_path, f"{filename}.json")
+
+         if not os.path.exists(file_path):
+             return None
+
+         with open(file_path, "r") as f:
+             return json.load(f)
+
+     def list_files(self, dir_path: str, extension: Optional[str] = None) -> List[str]:
+         """List files in a directory, optionally filtered by extension."""
+         full_path = os.path.join(self.base_path, dir_path)
+
+         if not os.path.exists(full_path):
+             return []
+
+         files = os.listdir(full_path)
+
+         if extension:
+             return [f for f in files if f.endswith(extension)]
+
+         return files
+
+     def delete_file(self, dir_path: str, filename: str) -> bool:
+         """Delete a file."""
+         file_path = os.path.join(self.base_path, dir_path, filename)
+
+         if os.path.exists(file_path):
+             os.remove(file_path)
+             return True
+
+         return False
+
+     def backup(self, backup_path: Optional[str] = None) -> str:
+         """Create a backup of the entire storage."""
+         if not backup_path:
+             backup_path = f"{self.base_path}_backup"
+
+         shutil.make_archive(backup_path, "zip", self.base_path)
+         return f"{backup_path}.zip"
+
+     def restore(self, backup_path: str) -> bool:
+         """Restore from a backup archive."""
+         if not os.path.exists(backup_path):
+             return False
+
+         shutil.rmtree(self.base_path, ignore_errors=True)
+         os.makedirs(self.base_path, exist_ok=True)
+
+         shutil.unpack_archive(backup_path, self.base_path)
+         return True
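A quick sketch of the round trip Storage supports; the directory and file names below are placeholders:

from promptlab.utils.storage import Storage

storage = Storage("promptlab_storage")
storage.save_json("prompts", "example", {"name": "Example", "content": "Hello, {name}!"})

data = storage.load_json("prompts", "example")         # dict, or None if the file is missing
print(storage.list_files("prompts", extension=".json"))

archive = storage.backup()                              # writes promptlab_storage_backup.zip
storage.restore(archive)                                # wipes base_path and unpacks the archive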
promptlab/utils/templating.py ADDED
@@ -0,0 +1,259 @@
+ import re
+ import json
+ from typing import Dict, Any, List, Optional, Union, Callable
+ from string import Formatter
+
+ class TemplateError(Exception):
+     """Exception raised for errors in template rendering."""
+     pass
+
+ class PromptTemplate:
+     """Advanced templating system for prompts."""
+     def __init__(self, template: str):
+         self.template = template
+         self._validate_template()
+
+     def _validate_template(self) -> None:
+         """Validate template syntax."""
+         try:
+             # Check for basic placeholder syntax
+             list(Formatter().parse(self.template))
+
+             # Check for conditional syntax
+             self._validate_conditionals()
+
+             # Check for loop syntax
+             self._validate_loops()
+         except Exception as e:
+             raise TemplateError(f"Invalid template syntax: {str(e)}")
+
+     def _validate_conditionals(self) -> None:
+         """Validate conditional blocks in the template."""
+         # Simple validation to ensure if/endif blocks match
+         if_count = len(re.findall(r'\{\s*if\s+.*?\s*\}', self.template))
+         endif_count = len(re.findall(r'\{\s*endif\s*\}', self.template))
+
+         if if_count != endif_count:
+             raise TemplateError(f"Mismatched conditional blocks: {if_count} 'if' and {endif_count} 'endif'")
+
+     def _validate_loops(self) -> None:
+         """Validate loop blocks in the template."""
+         # Simple validation to ensure for/endfor blocks match
+         for_count = len(re.findall(r'\{\s*for\s+.*?\s*\}', self.template))
+         endfor_count = len(re.findall(r'\{\s*endfor\s*\}', self.template))
+
+         if for_count != endfor_count:
+             raise TemplateError(f"Mismatched loop blocks: {for_count} 'for' and {endfor_count} 'endfor'")
+
+     def _render_conditionals(self, template: str, variables: Dict[str, Any]) -> str:
+         """Process conditional blocks in the template."""
+         # Handle if-else-endif blocks
+         pattern = r'\{\s*if\s+(.*?)\s*\}(.*?)(?:\{\s*else\s*\}(.*?))?\{\s*endif\s*\}'
+
+         def replace_conditional(match):
+             condition = match.group(1)
+             if_block = match.group(2)
+             else_block = match.group(3) or ""
+
+             # Evaluate condition
+             try:
+                 # Replace variables in condition
+                 for var_name, var_value in variables.items():
+                     if isinstance(var_value, str):
+                         # For strings, replace with quoted value
+                         condition = condition.replace(var_name, f'"{var_value}"')
+                     else:
+                         # For other types, replace directly
+                         condition = condition.replace(var_name, str(var_value))
+
+                 result = eval(condition, {"__builtins__": {}}, variables)
+                 return if_block if result else else_block
+             except Exception as e:
+                 raise TemplateError(f"Error evaluating condition '{condition}': {str(e)}")
+
+         # Use re.DOTALL to match across multiple lines
+         return re.sub(pattern, replace_conditional, template, flags=re.DOTALL)
+
+     def _render_loops(self, template: str, variables: Dict[str, Any]) -> str:
+         """Process loop blocks in the template."""
+         # Handle for loops
+         pattern = r'\{\s*for\s+(.*?)\s+in\s+(.*?)\s*\}(.*?)\{\s*endfor\s*\}'
+
+         def replace_loop(match):
+             var_name = match.group(1)
+             iterable_expr = match.group(2)
+             loop_body = match.group(3)
+
+             try:
+                 # Get the iterable from variables
+                 if iterable_expr in variables and hasattr(variables[iterable_expr], '__iter__'):
+                     iterable = variables[iterable_expr]
+                 else:
+                     # Try to evaluate the expression
+                     iterable = eval(iterable_expr, {"__builtins__": {}}, variables)
+
+                 if not hasattr(iterable, '__iter__'):
+                     raise TemplateError(f"'{iterable_expr}' is not iterable")
+
+                 # Process the loop body for each item
+                 result = []
+                 for item in iterable:
+                     # Create a copy of variables with loop variable
+                     loop_vars = variables.copy()
+                     loop_vars[var_name] = item
+
+                     # Process the loop body with the new variables
+                     body_content = loop_body
+                     for k, v in loop_vars.items():
+                         placeholder = f"{{{k}}}"
+                         if placeholder in body_content:
+                             body_content = body_content.replace(placeholder, str(v))
+
+                     result.append(body_content)
+
+                 return "".join(result)
+             except Exception as e:
+                 raise TemplateError(f"Error processing loop '{match.group(0)}': {str(e)}")
+
+         # Use re.DOTALL to match across multiple lines
+         return re.sub(pattern, replace_loop, template, flags=re.DOTALL)
+
+     def _apply_filters(self, value: Any, filters: List[str]) -> str:
+         """Apply filters to a value."""
+         result = value
+         for filter_name in filters:
+             if filter_name == "upper":
+                 result = str(result).upper()
+             elif filter_name == "lower":
+                 result = str(result).lower()
+             elif filter_name == "title":
+                 result = str(result).title()
+             elif filter_name == "capitalize":
+                 result = str(result).capitalize()
+             elif filter_name == "strip":
+                 result = str(result).strip()
+             elif filter_name == "json":
+                 result = json.dumps(result)
+             else:
+                 raise TemplateError(f"Unknown filter: {filter_name}")
+         return result
+
+     def _render_variables(self, template: str, variables: Dict[str, Any]) -> str:
+         """Replace variables in the template with their values."""
+         result = template
+
+         # Process variables with filters
+         pattern = r'\{(.*?)(?:\|(.*?))?\}'
+
+         def replace_var(match):
+             var_expr = match.group(1).strip()
+             filters_expr = match.group(2)
+
+             # Extract filters
+             filters = []
+             if filters_expr:
+                 filters = [f.strip() for f in filters_expr.split('|')]
+
+             try:
+                 # Simple variable
+                 if var_expr in variables:
+                     value = variables[var_expr]
+                 else:
+                     # Try to evaluate as an expression
+                     try:
+                         value = eval(var_expr, {"__builtins__": {}}, variables)
+                     except:
+                         return match.group(0)  # Keep as is if evaluation fails
+
+                 # Apply filters
+                 return str(self._apply_filters(value, filters))
+             except Exception as e:
+                 raise TemplateError(f"Error processing variable '{var_expr}': {str(e)}")
+
+         return re.sub(pattern, replace_var, result)
+
+     def render(self, **kwargs) -> str:
+         """Render the template with provided variables."""
+         result = self.template
+
+         # Process templates in multiple passes
+         # First, handle conditional blocks
+         result = self._render_conditionals(result, kwargs)
+
+         # Then, handle loops
+         result = self._render_loops(result, kwargs)
+
+         # Finally, handle simple variable substitution
+         result = self._render_variables(result, kwargs)
+
+         return result
+
+
+ class PromptTemplateRegistry:
+     """Registry for prompt templates."""
+     def __init__(self):
+         self.templates: Dict[str, PromptTemplate] = {}
+
+     def register(self, name: str, template: Union[str, PromptTemplate]) -> None:
+         """Register a template."""
+         if isinstance(template, str):
+             template = PromptTemplate(template)
+         self.templates[name] = template
+
+     def get(self, name: str) -> Optional[PromptTemplate]:
+         """Get a template by name."""
+         return self.templates.get(name)
+
+     def render(self, name: str, **kwargs) -> str:
+         """Render a template by name."""
+         template = self.get(name)
+         if not template:
+             raise ValueError(f"Template '{name}' not found")
+         return template.render(**kwargs)
+
+     def list_templates(self) -> List[str]:
+         """List all registered templates."""
+         return list(self.templates.keys())
+
+
+ # Create a singleton instance
+ template_registry = PromptTemplateRegistry()
+
+ # Register some common templates
+ template_registry.register(
+     "basic_completion",
+     """
+ {system_message}
+
+ {user_message}
+ """
+ )
+
+ template_registry.register(
+     "chat_template",
+     """
+ {system_message}
+
+ {for message in conversation}
+ {if message.role == "user"}Human: {message.content}
+ {else}Assistant: {message.content}
+ {endif}
+ {endfor}
+ """
+ )
+
+ template_registry.register(
+     "few_shot",
+     """
+ {system_message}
+
+ Here are some examples:
+ {for example in examples}
+ Input: {example.input}
+ Output: {example.output}
+ {endfor}
+
+ Input: {input}
+ Output:
+ """
+ )
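The bundled chat_template and few_shot templates lean on attribute access (message.role, example.input) inside loops, which the simple placeholder substitution above only performs for flat {name}-style variables, so flat variables are the safe path. A sketch of the templating layer on its own, with an invented template that exercises a string filter, a boolean condition, and a loop over plain strings:

from promptlab.utils.templating import PromptTemplate

template = PromptTemplate(
    "Hello {name|title}!\n"
    "{if premium}Thanks for subscribing.{else}Consider upgrading.{endif}\n"
    "Your topics:\n"
    "{for topic in topics}- {topic}\n{endfor}"
)

print(template.render(name="alice", premium=True, topics=["prompts", "evaluation"]))
# Hello Alice!
# Thanks for subscribing.
# Your topics:
# - prompts
# - evaluation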
pyproject.toml ADDED
@@ -0,0 +1,45 @@
+ [build-system]
+ requires = ["setuptools>=42", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "promptlab"
+ version = "0.1.0"
+ description = "A comprehensive LLM Prompt Management System"
+ readme = "README.md"
+ requires-python = ">=3.7"
+ license = {text = "MIT"}
+ keywords = ["llm", "prompt engineering", "nlp", "machine learning"]
+ authors = [
+     {name = "Biswanath Roul"}
+ ]
+ maintainers = [
+     {name = "Biswanath Roul"}
+ ]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.7",
+     "Programming Language :: Python :: 3.8",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+ dependencies = [
+     "numpy>=1.20.0",
+ ]
+
+ [project.urls]
+ "Homepage" = "https://github.com/biswanathroul/promptlab"
+ "Bug Tracker" = "https://github.com/biswanathroul/promptlab/issues"
+ "Documentation" = "https://github.com/biswanathroul/promptlab/wiki"
+ "Source Code" = "https://github.com/biswanathroul/promptlab"
+
+ [project.scripts]
+ promptlab = "promptlab.cli.commands:main"
+
+ [tool.setuptools]
+ packages = ["promptlab", "promptlab.core", "promptlab.cli", "promptlab.utils"]