biswanath2.roul committed
Commit e54fd17 · 0 Parent(s)

Initial commit

Files changed:
- .DS_Store +0 -0
- .gitignore +60 -0
- LICENSE +21 -0
- README.md +167 -0
- docs/README.md +11 -0
- docs/advanced_features.md +268 -0
- docs/api_reference.md +247 -0
- docs/cli_usage.md +118 -0
- docs/getting_started.md +110 -0
- docs/integration_examples.md +584 -0
- promptlab/__init__.py +39 -0
- promptlab/cli/__init__.py +0 -0
- promptlab/cli/commands.py +697 -0
- promptlab/core/__init__.py +0 -0
- promptlab/core/evaluation.py +191 -0
- promptlab/core/prompt_manager.py +169 -0
- promptlab/core/testing.py +451 -0
- promptlab/core/version_control.py +161 -0
- promptlab/examples/__init__.py +0 -0
- promptlab/examples/ab_testing.py +117 -0
- promptlab/examples/basic_usage.py +109 -0
- promptlab/examples/evaluation_example.py +95 -0
- promptlab/tests/__init__.py +0 -0
- promptlab/tests/test_evaluation.py +0 -0
- promptlab/tests/test_prompt_manager.py +115 -0
- promptlab/tests/test_testing.py +0 -0
- promptlab/tests/test_version_control.py +0 -0
- promptlab/utils/__init__.py +0 -0
- promptlab/utils/metrics.py +161 -0
- promptlab/utils/storage.py +79 -0
- promptlab/utils/templating.py +259 -0
- pyproject.toml +45 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.gitignore
ADDED
@@ -0,0 +1,60 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
env/
venv/
ENV/
env.bak/
venv.bak/
pl200525/

# Jupyter Notebook
.ipynb_checkpoints

# Prompt storage (for local development)
promptlab_storage/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Biswanath Roul

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,167 @@
# PromptLab: LLM Prompt Management System

PromptLab is a comprehensive library for managing, versioning, testing, and evaluating prompts for Large Language Models (LLMs). It provides a structured framework to help data scientists and developers create, optimize, and maintain high-quality prompts.

## Features

- **Prompt Management**: Create, update, and organize prompts with metadata and tags
- **Version Control**: Track prompt changes over time with full version history
- **A/B Testing**: Compare different prompt variations to find the most effective one
- **Evaluation Framework**: Measure prompt quality with customizable metrics
- **Advanced Templating**: Create dynamic prompts with variables, conditionals, and loops
- **Command-line Interface**: Easily integrate into your workflow

## Documentation

For detailed documentation, see the [docs](./docs) directory:

- [Getting Started](./docs/getting_started.md)
- [API Reference](./docs/api_reference.md)
- [CLI Usage](./docs/cli_usage.md)
- [Advanced Features](./docs/advanced_features.md)
- [Integration Examples](./docs/integration_examples.md)

## Installation

```bash
pip install promptlab
```

## Quick Start

```python
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator

# Initialize components
prompt_manager = PromptManager()
version_control = VersionControl(prompt_manager)
testing = PromptTesting(prompt_manager)
evaluator = Evaluator(prompt_manager)

# Create a prompt
prompt = prompt_manager.create(
    content="Summarize the following text: {text}",
    name="Simple Summarization",
    description="A simple prompt for text summarization",
    tags=["summarization", "basic"]
)

# Create a new version
version_control.commit(
    prompt_id=prompt.id,
    commit_message="Initial version"
)

# Update the prompt
prompt_manager.update(
    prompt.id,
    content="Please provide a concise summary of the following text in 2-3 sentences: {text}"
)

# Commit the updated version
version_control.commit(
    prompt_id=prompt.id,
    commit_message="Improved prompt with length guidance"
)

# Create a test case
test_case = testing.create_test_case(
    prompt_id=prompt.id,
    input_vars={"text": "Lorem ipsum dolor sit amet..."},
    expected_output="This is a summary of the text."
)

# Define an LLM callback for testing
async def llm_callback(prompt, vars):
    # In a real scenario, this would call an actual LLM API
    return "This is a summary of the text."

# Run the test case
import asyncio
test_result = asyncio.run(testing.run_test_case(
    test_case_id=test_case.id,
    llm_callback=llm_callback
))

# Evaluate a prompt with multiple inputs
evaluation_result = asyncio.run(evaluator.evaluate_prompt(
    prompt_id=prompt.id,
    inputs=[{"text": "Sample text 1"}, {"text": "Sample text 2"}],
    llm_callback=llm_callback
))

print(f"Evaluation metrics: {evaluation_result['aggregated_metrics']}")
```

## Command-line Interface

PromptLab comes with a powerful CLI for managing prompts:

```bash
# Create a prompt
promptlab prompt create "Summarization" --content "Summarize: {text}" --tags "summarization,basic"

# List all prompts
promptlab prompt list

# Create a new version
promptlab version commit <prompt_id> --message "Updated prompt"

# Run tests
promptlab test run-all <prompt_id> --llm openai
```

## Advanced Usage

### Advanced Templating

PromptLab supports advanced templating with conditionals and loops:

```python
from promptlab import PromptTemplate

template = PromptTemplate("""
{system_message}

{for example in examples}
Input: {example.input}
Output: {example.output}
{endfor}

Input: {input}
Output:
""")

rendered = template.render(
    system_message="You are a helpful assistant.",
    examples=[
        {"input": "Hello", "output": "Hi there!"},
        {"input": "How are you?", "output": "I'm doing well, thanks!"}
    ],
    input="What's the weather like?"
)
```

### Custom Evaluation Metrics

Create custom metrics to evaluate prompt performance:

```python
from promptlab import EvaluationMetric, Evaluator

class CustomMetric(EvaluationMetric):
    def __init__(self):
        super().__init__("custom_metric", "My custom evaluation metric")

    def compute(self, generated_output, expected_output=None, **kwargs):
        # Custom logic to score the output
        return score  # A float between 0 and 1

# Register the custom metric
evaluator = Evaluator(prompt_manager)
evaluator.register_metric(CustomMetric())
```

## Use Cases

- **Prompt Development**: Iteratively develop and refine prompts with version control
- **Prompt Optimization**: A/B test different prompt variations to find the most effective approach
- **Quality Assurance**: Ensure prompt quality with automated testing and evaluation
- **Team Collaboration**: Share and collaborate on prompts with a centralized management system
- **Production Deployment**: Maintain consistent prompt quality in production applications

## License

MIT License

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Author

Biswanath Roul - [GitHub](https://github.com/biswanathroul)
docs/README.md
ADDED
@@ -0,0 +1,11 @@
# PromptLab Documentation

This directory contains detailed documentation for the PromptLab library.

## Contents

- [Getting Started](./getting_started.md)
- [API Reference](./api_reference.md)
- [CLI Usage](./cli_usage.md)
- [Advanced Features](./advanced_features.md)
- [Integration Examples](./integration_examples.md)
docs/advanced_features.md
ADDED
@@ -0,0 +1,268 @@
# Advanced Features

PromptLab provides several advanced features for sophisticated prompt engineering.

## Advanced Templating

PromptLab's templating system goes beyond simple variable substitution, offering conditionals and loops.

### Basic Variable Substitution

```python
from promptlab import PromptTemplate

# Simple variable substitution
template = PromptTemplate("Hello, {name}!")
rendered = template.render(name="John")
# Result: "Hello, John!"
```

### Conditional Logic

```python
# Conditionals
template = PromptTemplate("""
{if is_formal}
Dear {name},

I hope this message finds you well.
{else}
Hey {name}!
{endif}

{message}
""")

formal = template.render(is_formal=True, name="Dr. Smith", message="Please review the attached document.")
casual = template.render(is_formal=False, name="Bob", message="Want to grab lunch?")
```

### Loops

```python
# Loops
template = PromptTemplate("""
Here are your tasks:

{for task in tasks}
- {task.priority}: {task.description}
{endfor}
""")

rendered = template.render(tasks=[
    {"priority": "High", "description": "Complete the report"},
    {"priority": "Medium", "description": "Schedule meeting"},
    {"priority": "Low", "description": "Organize files"}
])
```

### Nested Structures

```python
# Combining loops and conditionals
template = PromptTemplate("""
{system_message}

{for example in examples}
User: {example.input}
{if example.has_reasoning}
Reasoning: {example.reasoning}
{endif}
Assistant: {example.output}
{endfor}

User: {query}
Assistant:
""")

rendered = template.render(
    system_message="You are a helpful assistant.",
    examples=[
        {
            "input": "What's 2+2?",
            "has_reasoning": True,
            "reasoning": "Adding 2 and 2 gives 4",
            "output": "4"
        },
        {
            "input": "Hello",
            "has_reasoning": False,
            "output": "Hi there! How can I help you today?"
        }
    ],
    query="What's the capital of France?"
)
```

## Custom Evaluation Metrics

You can create custom metrics to evaluate prompt outputs based on your specific requirements.

### Creating a Custom Metric

```python
from promptlab import EvaluationMetric

class RelevanceMetric(EvaluationMetric):
    """Evaluates relevance of output to a given topic."""

    def __init__(self, topics):
        super().__init__("relevance", "Evaluates relevance to specified topics")
        self.topics = topics

    def compute(self, generated_output, expected_output=None, **kwargs):
        """
        Compute relevance score based on topic presence.
        Returns a float between 0 and 1.
        """
        score = 0
        output_lower = generated_output.lower()

        for topic in self.topics:
            if topic.lower() in output_lower:
                score += 1

        # Normalize to 0-1 range
        return min(1.0, score / len(self.topics)) if self.topics else 0.0
```

### Using Custom Metrics

```python
from promptlab import Evaluator, PromptManager

# Initialize components
prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Register custom metric
climate_relevance = RelevanceMetric(["climate", "temperature", "warming", "environment"])
evaluator.register_metric(climate_relevance)

# Use in evaluation
async def my_llm(prompt, vars):
    # Call your LLM API here
    return "Climate change is causing global temperature increases..."

results = await evaluator.evaluate_prompt(
    prompt_id="abc123",
    inputs=[{"topic": "climate change"}],
    llm_callback=my_llm,
    metric_names=["relevance"]  # Use our custom metric
)

print(f"Relevance score: {results['aggregated_metrics']['relevance']}")
```

## Customizing Storage

PromptLab allows you to customize where and how prompts and related data are stored.

### Custom Storage Locations

```python
# Specify a custom storage location
prompt_manager = PromptManager("/path/to/my/prompts")

# Export/import prompts
import json

# Export a prompt to a file
prompt = prompt_manager.get("abc123")
with open("exported_prompt.json", "w") as f:
    json.dump(prompt.to_dict(), f, indent=2)

# Import a prompt from a file
with open("exported_prompt.json", "r") as f:
    data = json.load(f)
imported_prompt = prompt_manager.import_prompt(data)
```

## LLM Integration

PromptLab is designed to work with any LLM through callback functions. Here are examples of integrating with popular LLM APIs.

### OpenAI Integration

```python
import openai
from promptlab import PromptManager, PromptTesting

prompt_manager = PromptManager()
testing = PromptTesting(prompt_manager)

# Configure OpenAI
openai.api_key = "your-api-key"

# OpenAI callback function
async def openai_callback(prompt, vars):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=150
    )
    return response.choices[0].message.content

# Run tests with OpenAI
test_results = await testing.run_all_tests("abc123", openai_callback)
```

### Anthropic Integration

```python
import anthropic
from promptlab import PromptManager, Evaluator

prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Configure Anthropic
client = anthropic.Anthropic(api_key="your-api-key")

# Anthropic callback function
async def anthropic_callback(prompt, vars):
    response = client.messages.create(
        model="claude-2",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150
    )
    return response.content[0].text

# Evaluate with Anthropic
eval_results = await evaluator.evaluate_prompt(
    prompt_id="abc123",
    inputs=[{"query": "What is machine learning?"}],
    llm_callback=anthropic_callback
)
```

### Hugging Face Integration

```python
from transformers import pipeline
import asyncio
from promptlab import PromptManager, VersionControl

prompt_manager = PromptManager()
version_control = VersionControl(prompt_manager)

# Set up Hugging Face pipeline
generator = pipeline('text-generation', model='gpt2')

# Hugging Face callback function
async def hf_callback(prompt, vars):
    # Run synchronously but in a way that doesn't block the asyncio event loop
    loop = asyncio.get_event_loop()
    result = await loop.run_in_executor(None, lambda: generator(prompt, max_length=100)[0]['generated_text'])
    return result

# Use with version control
prompt = prompt_manager.create(
    content="Complete this: {text}",
    name="Text Completion"
)
version_control.commit(prompt.id, "Initial version")

# Test with different models by swapping the callback
```
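The closing comment above suggests comparing models by swapping the callback while keeping the prompt and test cases fixed. Below is a minimal sketch of that pattern, using only `PromptTesting.run_all_tests` as documented in the API reference; the two callbacks are stand-ins rather than real model integrations.

```python
import asyncio
from promptlab import PromptManager, PromptTesting

prompt_manager = PromptManager()
testing = PromptTesting(prompt_manager)

# The same prompt and test case are reused for every model under comparison.
prompt = prompt_manager.create(content="Complete this: {text}", name="Text Completion")
testing.create_test_case(prompt_id=prompt.id, input_vars={"text": "Once upon a time"})

# Placeholder callbacks standing in for two different model backends.
async def model_a_callback(prompt_text, vars):
    return "Once upon a time there was a prompt."

async def model_b_callback(prompt_text, vars):
    return "Once upon a time there was a library."

async def compare_models():
    # Run the identical test suite against each callback and collect results side by side.
    results = {}
    for name, callback in [("model_a", model_a_callback), ("model_b", model_b_callback)]:
        results[name] = await testing.run_all_tests(prompt.id, callback)
    return results

results_by_model = asyncio.run(compare_models())
```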
docs/api_reference.md
ADDED
@@ -0,0 +1,247 @@
# API Reference

This document provides detailed API documentation for the main components of PromptLab.

## PromptManager

The `PromptManager` class is the core component for managing prompts.

```python
from promptlab import PromptManager
```

### Methods

#### `__init__(storage_path=None)`
- **Description**: Initialize a new PromptManager.
- **Parameters**:
  - `storage_path` (str, optional): Path to store prompts. Defaults to "~/promptlab_storage".

#### `create(content, name, description='', tags=None, metadata=None)`
- **Description**: Create a new prompt.
- **Parameters**:
  - `content` (str): The prompt text with optional variables in {variable_name} format.
  - `name` (str): Name of the prompt.
  - `description` (str, optional): Description of the prompt.
  - `tags` (list of str, optional): Tags for categorization.
  - `metadata` (dict, optional): Additional metadata.
- **Returns**: `Prompt` object.

#### `get(prompt_id)`
- **Description**: Get a prompt by ID.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
- **Returns**: `Prompt` object or None if not found.

#### `update(prompt_id, content=None, name=None, description=None, tags=None, metadata=None)`
- **Description**: Update a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to update.
  - `content` (str, optional): New prompt text.
  - `name` (str, optional): New name.
  - `description` (str, optional): New description.
  - `tags` (list of str, optional): New tags.
  - `metadata` (dict, optional): New metadata.
- **Returns**: Updated `Prompt` object.

#### `delete(prompt_id)`
- **Description**: Delete a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to delete.
- **Returns**: True if deleted, False otherwise.

#### `list_all()`
- **Description**: List all prompts.
- **Returns**: List of `Prompt` objects.

#### `search_by_tags(tags, match_all=False)`
- **Description**: Search prompts by tags.
- **Parameters**:
  - `tags` (list of str): Tags to search for.
  - `match_all` (bool, optional): If True, prompt must have all tags.
- **Returns**: List of matching `Prompt` objects.

## VersionControl

The `VersionControl` class manages prompt versions.

```python
from promptlab import VersionControl
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the version control system.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `commit(prompt_id, commit_message, metadata=None)`
- **Description**: Create a new version of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `commit_message` (str): Message describing the changes.
  - `metadata` (dict, optional): Additional version metadata.
- **Returns**: Version number (int).

#### `list_versions(prompt_id)`
- **Description**: List all versions of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
- **Returns**: List of version objects.

#### `get_version(prompt_id, version_number)`
- **Description**: Get a specific version of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version_number` (int): The version number.
- **Returns**: Version data.

#### `checkout(prompt_id, version_number)`
- **Description**: Revert a prompt to a specific version.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version_number` (int): The version to revert to.
- **Returns**: Updated `Prompt` object.

#### `diff(prompt_id, version1, version2)`
- **Description**: Compare two versions of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version1` (int): First version number.
  - `version2` (int): Second version number.
- **Returns**: Diff object.

## PromptTesting

The `PromptTesting` class provides testing capabilities.

```python
from promptlab import PromptTesting
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the testing system.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `create_test_case(prompt_id, input_vars, expected_output=None, name=None, description=None)`
- **Description**: Create a test case for a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to test.
  - `input_vars` (dict): Variables to substitute in the prompt.
  - `expected_output` (str, optional): Expected response.
  - `name` (str, optional): Test case name.
  - `description` (str, optional): Test case description.
- **Returns**: Test case object.

#### `run_test_case(test_case_id, llm_callback)`
- **Description**: Run a test case.
- **Parameters**:
  - `test_case_id` (str): The ID of the test case.
  - `llm_callback` (callable): Function to call LLM.
- **Returns**: Test result.

#### `run_all_tests(prompt_id, llm_callback)`
- **Description**: Run all tests for a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `llm_callback` (callable): Function to call LLM.
- **Returns**: List of test results.

#### `ab_test(prompt_id_a, prompt_id_b, test_cases, llm_callback, metrics=None)`
- **Description**: Run A/B tests comparing two prompts.
- **Parameters**:
  - `prompt_id_a` (str): First prompt ID.
  - `prompt_id_b` (str): Second prompt ID.
  - `test_cases` (list): Test cases to run.
  - `llm_callback` (callable): Function to call LLM.
  - `metrics` (list, optional): Metrics to compare.
- **Returns**: A/B test results.

## Evaluator

The `Evaluator` class handles prompt evaluation.

```python
from promptlab import Evaluator
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the evaluator.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `register_metric(metric)`
- **Description**: Register a new evaluation metric.
- **Parameters**:
  - `metric` (EvaluationMetric): The metric to register.

#### `evaluate_prompt(prompt_id, inputs, llm_callback, expected_outputs=None, metric_names=None)`
- **Description**: Evaluate a prompt with the given inputs and metrics.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `inputs` (list): List of input dictionaries.
  - `llm_callback` (callable): Function to call LLM.
  - `expected_outputs` (list, optional): Expected outputs.
  - `metric_names` (list, optional): Metrics to use.
- **Returns**: Evaluation results.

## PromptTemplate

The `PromptTemplate` class provides advanced templating.

```python
from promptlab import PromptTemplate
```

### Methods

#### `__init__(template_string)`
- **Description**: Initialize a template.
- **Parameters**:
  - `template_string` (str): Template with variables, conditionals, and loops.

#### `render(**variables)`
- **Description**: Render the template with given variables.
- **Parameters**:
  - `variables` (dict): Variables to substitute.
- **Returns**: Rendered string.

## EvaluationMetric

The `EvaluationMetric` is the base class for evaluation metrics.

```python
from promptlab import EvaluationMetric
```

### Methods

#### `__init__(name, description=None)`
- **Description**: Initialize a metric.
- **Parameters**:
  - `name` (str): Metric name.
  - `description` (str, optional): Metric description.

#### `compute(generated_output, expected_output=None, **kwargs)`
- **Description**: Compute the metric score.
- **Parameters**:
  - `generated_output` (str): Output from LLM.
  - `expected_output` (str, optional): Expected output.
  - `**kwargs`: Additional parameters.
- **Returns**: Score (float between 0 and 1).

### Built-in Metrics

- `ExactMatchMetric`: Scores exact matches between generated and expected output.
- `ContainsKeywordsMetric`: Scores based on keyword presence.
- `LengthMetric`: Scores based on output length.

```python
from promptlab import ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
```
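The built-in metrics above are listed without a worked example elsewhere in these docs. The sketch below wires `ExactMatchMetric` into `Evaluator.evaluate_prompt` using only the signatures documented on this page; the no-argument `ExactMatchMetric()` constructor and the `"exact_match"` metric name are assumptions, and the callback is a placeholder rather than a real LLM call.

```python
import asyncio
from promptlab import PromptManager, Evaluator, ExactMatchMetric

prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Register a built-in metric; a no-argument constructor is assumed here.
evaluator.register_metric(ExactMatchMetric())

prompt = prompt_manager.create(
    content="Answer with a single word: what is the capital of France? {hint}",
    name="Capital Quiz",
)

# Placeholder callback standing in for a real LLM API call.
async def llm_callback(prompt_text, vars):
    return "Paris"

results = asyncio.run(evaluator.evaluate_prompt(
    prompt_id=prompt.id,
    inputs=[{"hint": ""}],
    expected_outputs=["Paris"],
    llm_callback=llm_callback,
    metric_names=["exact_match"],  # metric name assumed to mirror the class name
))
print(results["aggregated_metrics"])
```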
docs/cli_usage.md
ADDED
@@ -0,0 +1,118 @@
# CLI Usage

PromptLab provides a command-line interface (CLI) for managing prompts, versions, tests, and evaluations.

## Basic Commands

### Prompt Management

```bash
# Create a prompt
promptlab prompt create "Weather Forecast" --content "Provide a weather forecast for {location} on {date}" --tags "weather,forecast"

# List all prompts
promptlab prompt list

# Get prompt details
promptlab prompt get <prompt_id>

# Update a prompt
promptlab prompt update <prompt_id> --content "New content" --tags "new,tags"

# Delete a prompt
promptlab prompt delete <prompt_id>
```

### Version Control

```bash
# Commit a version
promptlab version commit <prompt_id> --message "Version description"

# List versions
promptlab version list <prompt_id>

# Check out (revert to) a specific version
promptlab version checkout <prompt_id> <version_number>

# Compare versions
promptlab version diff <prompt_id> <version1> <version2>
```

### Testing

```bash
# Create a test case
promptlab test create <prompt_id> --input '{"location": "New York", "date": "tomorrow"}' --expected "Expected output"

# List test cases
promptlab test list <prompt_id>

# Run a specific test case
promptlab test run <test_case_id> --llm openai

# Run all test cases for a prompt
promptlab test run-all <prompt_id> --llm openai

# Run an A/B test between two prompts
promptlab test ab <prompt_id_a> <prompt_id_b> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
```

### Evaluation

```bash
# Evaluate a prompt
promptlab eval run <prompt_id> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai

# List available metrics
promptlab eval metrics

# Register a custom metric
promptlab eval register-metric <metric_file.py>
```

## Environment Configuration

The CLI supports environment variables for configuration:

- `PROMPTLAB_STORAGE`: Path to store prompts and related data
- `PROMPTLAB_OPENAI_API_KEY`: OpenAI API key for built-in LLM support
- `PROMPTLAB_DEFAULT_LLM`: Default LLM to use for testing and evaluation

You can also create a config file at `~/.promptlab/config.json`:

```json
{
  "storage_path": "/path/to/storage",
  "default_llm": "openai",
  "api_keys": {
    "openai": "your-openai-key"
  }
}
```

## Advanced Usage

### Multiple Storage Locations

```bash
# Specify a storage location for a command
promptlab --storage /path/to/storage prompt list

# Export a prompt to another storage
promptlab prompt export <prompt_id> --output /path/to/output.json

# Import a prompt from a file
promptlab prompt import /path/to/prompt.json
```

### Automation and Scripting

```bash
# Get output in JSON format
promptlab --json prompt list

# Use in shell scripts
PROMPT_ID=$(promptlab --json prompt create "Script Prompt" --content "Content" | jq -r '.id')
echo "Created prompt with ID: $PROMPT_ID"
```
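As a convenience, here is one possible shell setup combining the environment variables documented above with the commands from the earlier sections; the storage path and API key value are placeholders.

```bash
# Point PromptLab at a project-local storage directory for this shell session.
export PROMPTLAB_STORAGE="$HOME/projects/my-app/promptlab_storage"

# Configure the built-in OpenAI support and make it the default backend.
export PROMPTLAB_OPENAI_API_KEY="sk-..."   # placeholder key
export PROMPTLAB_DEFAULT_LLM="openai"

# Commands in this shell now use the configured storage and default LLM.
promptlab prompt list
```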
docs/getting_started.md
ADDED
@@ -0,0 +1,110 @@
# Getting Started with PromptLab

This guide will help you get started with PromptLab, a comprehensive library for managing LLM prompts.

## Installation

```bash
pip install promptlab
```

## Basic Usage

### Initialize Components

```python
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator

# Initialize with default storage location
prompt_manager = PromptManager()

# Or specify a custom storage location
# prompt_manager = PromptManager("/path/to/storage")

# Initialize other components
version_control = VersionControl(prompt_manager)
testing = PromptTesting(prompt_manager)
evaluator = Evaluator(prompt_manager)
```

### Create and Manage Prompts

```python
# Create a prompt
prompt = prompt_manager.create(
    content="Translate the following text from {source_language} to {target_language}: {text}",
    name="Translation Prompt",
    description="A prompt for translating text between languages",
    tags=["translation", "multilingual"]
)

# The prompt.id property contains a unique identifier (e.g., "a1b2c3d4e5")
prompt_id = prompt.id

# Get a prompt by ID
retrieved_prompt = prompt_manager.get(prompt_id)

# Update a prompt
prompt_manager.update(
    prompt_id,
    content="Please translate the following text from {source_language} to {target_language}:\n\n{text}"
)

# Search prompts by tags
translation_prompts = prompt_manager.search_by_tags(["translation"])

# List all prompts
all_prompts = prompt_manager.list_all()
```

### Version Control

```python
# Create a version snapshot
version_control.commit(
    prompt_id=prompt_id,
    commit_message="Initial version"
)

# Update the prompt and create another version
prompt_manager.update(
    prompt_id,
    content="Please provide a translation of the following text from {source_language} to {target_language}:\n\n{text}\n\nMaintain the original formatting and tone."
)

version_control.commit(
    prompt_id=prompt_id,
    commit_message="Added formatting instructions"
)

# List all versions
versions = version_control.list_versions(prompt_id)

# Compare versions
diff = version_control.diff(prompt_id, 1, 2)

# Revert to a previous version
version_control.checkout(prompt_id, 1)
```

### Using Prompts with Variables

```python
# Get a prompt
prompt = prompt_manager.get(prompt_id)

# Render with variables
rendered_prompt = prompt.render(
    source_language="English",
    target_language="Spanish",
    text="Hello, how are you today?"
)

# Now use rendered_prompt with your LLM API
```

## Next Steps

- See the [CLI Usage](./cli_usage.md) guide for command-line operations
- Explore [Advanced Features](./advanced_features.md) for templating and custom metrics
- Check [Integration Examples](./integration_examples.md) for real-world use cases
docs/integration_examples.md
ADDED
@@ -0,0 +1,584 @@
1 |
+
# Integration Examples
|
2 |
+
|
3 |
+
This document provides concrete examples of integrating PromptLab into various applications and workflows.
|
4 |
+
|
5 |
+
## Customer Support Chatbot
|
6 |
+
|
7 |
+
### Setup
|
8 |
+
|
9 |
+
```python
|
10 |
+
from promptlab import PromptManager, VersionControl
|
11 |
+
import openai
|
12 |
+
|
13 |
+
# Initialize components
|
14 |
+
prompt_manager = PromptManager()
|
15 |
+
version_control = VersionControl(prompt_manager)
|
16 |
+
|
17 |
+
# Create prompt templates for different scenarios
|
18 |
+
greeting_prompt = prompt_manager.create(
|
19 |
+
content="You are a helpful customer service agent for {company_name}. Greet the customer politely.",
|
20 |
+
name="Customer Greeting",
|
21 |
+
tags=["customer-service", "greeting"]
|
22 |
+
)
|
23 |
+
|
24 |
+
inquiry_prompt = prompt_manager.create(
|
25 |
+
content="""
|
26 |
+
You are a helpful customer service agent for {company_name}.
|
27 |
+
Customer inquiry: {customer_message}
|
28 |
+
|
29 |
+
Based on this inquiry:
|
30 |
+
1. Identify the main issue
|
31 |
+
2. Provide a helpful response
|
32 |
+
3. Offer additional assistance
|
33 |
+
|
34 |
+
Keep your tone professional but friendly.
|
35 |
+
""",
|
36 |
+
name="Customer Inquiry Response",
|
37 |
+
tags=["customer-service", "inquiry"]
|
38 |
+
)
|
39 |
+
|
40 |
+
# Version them
|
41 |
+
version_control.commit(greeting_prompt.id, "Initial version")
|
42 |
+
version_control.commit(inquiry_prompt.id, "Initial version")
|
43 |
+
|
44 |
+
# OpenAI callback
|
45 |
+
def generate_response(prompt_text):
|
46 |
+
response = openai.ChatCompletion.create(
|
47 |
+
model="gpt-3.5-turbo",
|
48 |
+
messages=[{"role": "user", "content": prompt_text}]
|
49 |
+
)
|
50 |
+
return response.choices[0].message.content
|
51 |
+
|
52 |
+
# Main handler function
|
53 |
+
def handle_customer_message(customer_name, message, is_new_conversation):
|
54 |
+
if is_new_conversation:
|
55 |
+
# Use greeting prompt for new conversations
|
56 |
+
prompt = prompt_manager.get(greeting_prompt.id)
|
57 |
+
prompt_text = prompt.render(company_name="Acme Inc.")
|
58 |
+
return generate_response(prompt_text)
|
59 |
+
else:
|
60 |
+
# Use inquiry prompt for ongoing conversations
|
61 |
+
prompt = prompt_manager.get(inquiry_prompt.id)
|
62 |
+
prompt_text = prompt.render(
|
63 |
+
company_name="Acme Inc.",
|
64 |
+
customer_message=message
|
65 |
+
)
|
66 |
+
return generate_response(prompt_text)
|
67 |
+
```
|
68 |
+
|
69 |
+
## Content Generation System
|
70 |
+
|
71 |
+
### Setup
|
72 |
+
|
73 |
+
```python
|
74 |
+
from promptlab import PromptManager, PromptTesting, Evaluator
|
75 |
+
import asyncio
|
76 |
+
|
77 |
+
# Initialize components
|
78 |
+
prompt_manager = PromptManager("content_system_prompts")
|
79 |
+
testing = PromptTesting(prompt_manager)
|
80 |
+
evaluator = Evaluator(prompt_manager)
|
81 |
+
|
82 |
+
# Create content generation prompt
|
83 |
+
blog_prompt = prompt_manager.create(
|
84 |
+
content="""
|
85 |
+
Write a blog post about {topic}.
|
86 |
+
|
87 |
+
Title: {title}
|
88 |
+
|
89 |
+
The post should:
|
90 |
+
- Be approximately {word_count} words
|
91 |
+
- Be written in a {tone} tone
|
92 |
+
- Include {num_sections} main sections
|
93 |
+
- Target audience: {audience}
|
94 |
+
- Include a compelling call-to-action at the end
|
95 |
+
|
96 |
+
Keywords to include: {keywords}
|
97 |
+
""",
|
98 |
+
name="Blog Post Generator",
|
99 |
+
tags=["content", "blog"]
|
100 |
+
)
|
101 |
+
|
102 |
+
# Test cases
|
103 |
+
test_case = testing.create_test_case(
|
104 |
+
prompt_id=blog_prompt.id,
|
105 |
+
input_vars={
|
106 |
+
"topic": "Sustainable Living",
|
107 |
+
"title": "10 Simple Ways to Reduce Your Carbon Footprint",
|
108 |
+
"word_count": "800",
|
109 |
+
"tone": "informative yet casual",
|
110 |
+
"num_sections": "5",
|
111 |
+
"audience": "environmentally-conscious millennials",
|
112 |
+
"keywords": "sustainability, eco-friendly, carbon footprint, climate change, lifestyle changes"
|
113 |
+
}
|
114 |
+
)
|
115 |
+
|
116 |
+
# LLM callback
|
117 |
+
async def content_llm_callback(prompt, vars):
|
118 |
+
# Call your preferred LLM API here
|
119 |
+
# This is a placeholder
|
120 |
+
return f"Generated content about {vars.get('topic', 'unknown topic')}"
|
121 |
+
|
122 |
+
# Content generation function
|
123 |
+
async def generate_content(content_type, parameters):
|
124 |
+
if content_type == "blog":
|
125 |
+
prompt = prompt_manager.get(blog_prompt.id)
|
126 |
+
rendered_prompt = prompt.render(**parameters)
|
127 |
+
|
128 |
+
# Generate content
|
129 |
+
content = await content_llm_callback(rendered_prompt, parameters)
|
130 |
+
|
131 |
+
# Evaluate quality
|
132 |
+
evaluation = await evaluator.evaluate_prompt(
|
133 |
+
prompt_id=blog_prompt.id,
|
134 |
+
inputs=[parameters],
|
135 |
+
llm_callback=content_llm_callback
|
136 |
+
)
|
137 |
+
|
138 |
+
quality_score = evaluation["aggregated_metrics"].get("length", 0)
|
139 |
+
|
140 |
+
return {
|
141 |
+
"content": content,
|
142 |
+
"quality_score": quality_score,
|
143 |
+
"metadata": {
|
144 |
+
"prompt_id": blog_prompt.id,
|
145 |
+
"prompt_version": prompt.version,
|
146 |
+
"parameters": parameters
|
147 |
+
}
|
148 |
+
}
|
149 |
+
else:
|
150 |
+
raise ValueError(f"Unsupported content type: {content_type}")
|
151 |
+
```
|
152 |
+
|
153 |
+
## AI-Assisted Research Tool
|
154 |
+
|
155 |
+
### Setup
|
156 |
+
|
157 |
+
```python
|
158 |
+
from promptlab import PromptManager, VersionControl
|
159 |
+
import json
|
160 |
+
import openai
|
161 |
+
|
162 |
+
# Initialize components
|
163 |
+
prompt_manager = PromptManager("research_prompts")
|
164 |
+
version_control = VersionControl(prompt_manager)
|
165 |
+
|
166 |
+
# Create research prompts
|
167 |
+
article_summary_prompt = prompt_manager.create(
|
168 |
+
content="""
|
169 |
+
Summarize the following research article:
|
170 |
+
|
171 |
+
Title: {article_title}
|
172 |
+
Abstract: {article_abstract}
|
173 |
+
|
174 |
+
Provide a summary that:
|
175 |
+
1. Identifies the main research question
|
176 |
+
2. Outlines the methodology
|
177 |
+
3. Summarizes key findings
|
178 |
+
4. Highlights limitations
|
179 |
+
5. Explains the significance of the results
|
180 |
+
|
181 |
+
Keep the summary concise, approximately 250 words.
|
182 |
+
""",
|
183 |
+
name="Article Summarizer",
|
184 |
+
tags=["research", "summary"]
|
185 |
+
)
|
186 |
+
|
187 |
+
research_question_prompt = prompt_manager.create(
|
188 |
+
content="""
|
189 |
+
Based on the following information:
|
190 |
+
|
191 |
+
Research Area: {research_area}
|
192 |
+
Existing Knowledge: {existing_knowledge}
|
193 |
+
Observed Gap: {knowledge_gap}
|
194 |
+
|
195 |
+
Generate 5 potential research questions that:
|
196 |
+
1. Address the identified knowledge gap
|
197 |
+
2. Are specific and answerable
|
198 |
+
3. Have theoretical or practical significance
|
199 |
+
4. Can be investigated with available research methods
|
200 |
+
""",
|
201 |
+
name="Research Question Generator",
|
202 |
+
tags=["research", "question-generation"]
|
203 |
+
)
|
204 |
+
|
205 |
+
# Version control
|
206 |
+
version_control.commit(article_summary_prompt.id, "Initial version")
|
207 |
+
version_control.commit(research_question_prompt.id, "Initial version")
|
208 |
+
|
209 |
+
# OpenAI callback
|
210 |
+
def research_assistant(prompt_text):
|
211 |
+
response = openai.ChatCompletion.create(
|
212 |
+
model="gpt-4",
|
213 |
+
messages=[{"role": "user", "content": prompt_text}]
|
214 |
+
)
|
215 |
+
return response.choices[0].message.content
|
216 |
+
|
217 |
+
# Research functions
|
218 |
+
def summarize_article(article_title, article_abstract):
|
219 |
+
prompt = prompt_manager.get(article_summary_prompt.id)
|
220 |
+
prompt_text = prompt.render(
|
221 |
+
article_title=article_title,
|
222 |
+
article_abstract=article_abstract
|
223 |
+
)
|
224 |
+
return research_assistant(prompt_text)
|
225 |
+
|
226 |
+
def generate_research_questions(research_area, existing_knowledge, knowledge_gap):
|
227 |
+
prompt = prompt_manager.get(research_question_prompt.id)
|
228 |
+
prompt_text = prompt.render(
|
229 |
+
research_area=research_area,
|
230 |
+
existing_knowledge=existing_knowledge,
|
231 |
+
knowledge_gap=knowledge_gap
|
232 |
+
)
|
233 |
+
return research_assistant(prompt_text)
|
234 |
+
|
235 |
+
# Save results
|
236 |
+
def save_research_data(research_project, data_type, content):
|
237 |
+
# Save the data along with prompt metadata for reproducibility
|
238 |
+
if data_type == "summary":
|
239 |
+
prompt_id = article_summary_prompt.id
|
240 |
+
prompt = prompt_manager.get(prompt_id)
|
241 |
+
elif data_type == "questions":
|
242 |
+
prompt_id = research_question_prompt.id
|
243 |
+
prompt = prompt_manager.get(prompt_id)
|
244 |
+
|
245 |
+
research_data = {
|
246 |
+
"content": content,
|
247 |
+
"metadata": {
|
248 |
+
"prompt_id": prompt_id,
|
249 |
+
"prompt_version": prompt.version,
|
250 |
+
"timestamp": datetime.datetime.now().isoformat()
|
251 |
+
}
|
252 |
+
}
|
253 |
+
|
254 |
+
# Save to file (in real application, might save to database)
|
255 |
+
with open(f"{research_project}_{data_type}.json", "w") as f:
|
256 |
+
json.dump(research_data, f, indent=2)
|
257 |
+
```
|
258 |
+
|
259 |
+
## Educational Quiz Generator
|
260 |
+
|
261 |
+
### Setup
|
262 |
+
|
263 |
+
```python
|
264 |
+
from promptlab import PromptManager, PromptTemplate
|
265 |
+
import asyncio
|
266 |
+
import aiohttp
|
267 |
+
|
268 |
+
# Initialize components
|
269 |
+
prompt_manager = PromptManager("education_prompts")
|
270 |
+
|
271 |
+
# Quiz generation prompt
|
272 |
+
quiz_prompt = prompt_manager.create(
|
273 |
+
content="""
|
274 |
+
Generate a quiz on the topic of {topic} at a {difficulty_level} difficulty level.
|
275 |
+
|
276 |
+
The quiz should:
|
277 |
+
- Have {num_questions} multiple-choice questions
|
278 |
+
- Cover the following subtopics: {subtopics}
|
279 |
+
    - Include {include_explanation} explanations for the correct answers
    - Be appropriate for {grade_level} students

    For each question, provide:
    1. The question text
    2. Four possible answers (A, B, C, D)
    3. The correct answer
    {if include_explanation == "yes"}
    4. An explanation of why the answer is correct
    {endif}

    Format the output as valid JSON.
    """,
    name="Quiz Generator",
    tags=["education", "quiz"]
)

# Quiz rendering template using advanced templating
render_template = PromptTemplate("""
<h1>{quiz_title}</h1>

<form id="quiz-form">
  {for question in questions}
  <div class="question">
    <p><strong>Question {question.number}:</strong> {question.text}</p>
    <ul style="list-style-type: none;">
      {for option in question.options}
      <li>
        <input type="radio" name="q{question.number}" id="q{question.number}_{option.letter}" value="{option.letter}">
        <label for="q{question.number}_{option.letter}">{option.letter}. {option.text}</label>
      </li>
      {endfor}
    </ul>

    {if show_answers}
    <div class="answer">
      <p><strong>Correct Answer:</strong> {question.correct_answer}</p>
      {if question.has_explanation}
      <p><strong>Explanation:</strong> {question.explanation}</p>
      {endif}
    </div>
    {endif}
  </div>
  {endfor}

  {if !show_answers}
  <button type="submit">Submit Quiz</button>
  {endif}
</form>
""")

# LLM callback
async def education_llm_callback(prompt, vars):
    # This would call your LLM API
    # Simulated response for this example
    await asyncio.sleep(1)  # Simulate API call
    if "quiz" in prompt:
        return """
        {
          "questions": [
            {
              "text": "What is the capital of France?",
              "options": [
                {"letter": "A", "text": "London"},
                {"letter": "B", "text": "Berlin"},
                {"letter": "C", "text": "Paris"},
                {"letter": "D", "text": "Madrid"}
              ],
              "correct_answer": "C",
              "explanation": "Paris is the capital and most populous city of France."
            },
            {
              "text": "Who wrote 'Romeo and Juliet'?",
              "options": [
                {"letter": "A", "text": "Charles Dickens"},
                {"letter": "B", "text": "William Shakespeare"},
                {"letter": "C", "text": "Jane Austen"},
                {"letter": "D", "text": "Mark Twain"}
              ],
              "correct_answer": "B",
              "explanation": "William Shakespeare wrote 'Romeo and Juliet' around 1594-1596."
            }
          ]
        }
        """
    return "Default response"

# Quiz generation function
async def generate_quiz(topic, difficulty, num_questions, grade_level, include_explanations=True):
    prompt = prompt_manager.get(quiz_prompt.id)
    rendered_prompt = prompt.render(
        topic=topic,
        difficulty_level=difficulty,
        num_questions=num_questions,
        subtopics=", ".join(["key concepts", "historical context", "practical applications"]),
        include_explanation="yes" if include_explanations else "no",
        grade_level=grade_level
    )

    # Get quiz content from LLM
    quiz_json = await education_llm_callback(rendered_prompt, {})

    # Parse JSON
    quiz_data = json.loads(quiz_json)

    # Prepare data for HTML template
    template_data = {
        "quiz_title": f"{topic} Quiz ({difficulty} Level)",
        "questions": [],
        "show_answers": False
    }

    # Format questions
    for i, q in enumerate(quiz_data["questions"]):
        question = {
            "number": i + 1,
            "text": q["text"],
            "options": q["options"],
            "correct_answer": q["correct_answer"],
            "has_explanation": "explanation" in q,
            "explanation": q.get("explanation", "")
        }
        template_data["questions"].append(question)

    # Render HTML
    return render_template.render(**template_data)
```
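
Once the prompt, template, and callback are wired up, `generate_quiz` can be driven like any other coroutine. The call below is only a minimal sketch; the topic, difficulty, and grade level are arbitrary placeholder values.

```python
import asyncio

async def demo():
    # Produces the HTML form rendered by render_template above
    html = await generate_quiz(
        topic="World Geography",      # placeholder topic
        difficulty="Beginner",
        num_questions=2,
        grade_level="6th grade",
        include_explanations=True
    )
    print(html)

asyncio.run(demo())
```
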
## Automated Coding Assistant

### Setup

```python
from promptlab import PromptManager, PromptTesting
import asyncio
import re  # used below to extract code blocks from the LLM response
import subprocess
import tempfile

# Initialize components
prompt_manager = PromptManager("coding_prompts")
testing = PromptTesting(prompt_manager)

# Create code generation prompts
function_prompt = prompt_manager.create(
    content="""
    Write a {language} function that solves the following problem:

    {problem_description}

    Function signature: {function_signature}

    Requirements:
    - The function should handle edge cases
    - Include appropriate comments
    - Follow {language} best practices
    - Be optimized for {optimization_goal}

    {if include_tests == "yes"}
    Also include unit tests for the function.
    {endif}
    """,
    name="Function Generator",
    tags=["coding", "function"]
)

bug_fix_prompt = prompt_manager.create(
    content="""
    Debug the following {language} code which has an issue:

    ```{language}
    {buggy_code}
    ```

    Error message or problem description:
    {error_description}

    Please:
    1. Identify the issue
    2. Explain the root cause
    3. Provide a fixed version of the code
    4. Suggest how to prevent similar issues
    """,
    name="Bug Fix Assistant",
    tags=["coding", "debugging"]
)

# LLM callback
async def coding_llm_callback(prompt, vars):
    # This would call your LLM API
    # Simplified example response
    await asyncio.sleep(1)

    if "function" in prompt:
        return """
        ```python
        def find_max_subarray_sum(arr):
            '''
            Finds the maximum sum of any contiguous subarray.
            Uses Kadane's algorithm with O(n) time complexity.

            Args:
                arr: List of integers
            Returns:
                Maximum subarray sum
            '''
            if not arr:
                return 0

            current_max = global_max = arr[0]

            for num in arr[1:]:
                current_max = max(num, current_max + num)
                global_max = max(global_max, current_max)

            return global_max

        # Unit tests
        def test_find_max_subarray_sum():
            assert find_max_subarray_sum([]) == 0
            assert find_max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]) == 6
            assert find_max_subarray_sum([-1, -2, -3]) == -1
            print("All tests passed!")
        ```
        """
    elif "debug" in prompt.lower():
        return """
        The issue is a classic off-by-one error in the loop boundary.

        Root cause:
        The loop is using `i <= len(arr)` which accesses an index that's out of bounds.

        Fixed code:
        ```python
        def process_array(arr):
            result = []
            for i in range(len(arr)):  # Changed from i <= len(arr)
                result.append(arr[i] * 2)
            return result
        ```

        Prevention:
        - Remember that array indices are 0-based and go up to len(arr)-1
        - Use range() or enumerate() when iterating through arrays by index
        - Add bounds checking for critical operations
        """

    return "Default response"

# Function to test generated code
def test_generated_code(code, language):
    """Test the generated code by running it in a safe environment."""
    if language.lower() == "python":
        with tempfile.NamedTemporaryFile(suffix=".py") as temp:
            temp.write(code.encode())
            temp.flush()

            try:
                result = subprocess.run(["python", temp.name],
                                        capture_output=True,
                                        text=True,
                                        timeout=5)
                if result.returncode == 0:
                    return {"success": True, "output": result.stdout}
                else:
                    return {"success": False, "error": result.stderr}
            except subprocess.TimeoutExpired:
                return {"success": False, "error": "Code execution timed out"}

    return {"success": False, "error": f"Testing not implemented for {language}"}

# Main coding assistant function
async def generate_function(problem, language="python", optimization_goal="readability", include_tests=True):
    function_name = problem.lower().replace(" ", "_").replace("-", "_")
    signature = f"def {function_name}(parameters):"

    prompt = prompt_manager.get(function_prompt.id)
    rendered_prompt = prompt.render(
        language=language,
        problem_description=problem,
        function_signature=signature,
        optimization_goal=optimization_goal,
        include_tests="yes" if include_tests else "no"
    )

    # Get code from LLM
    generated_code = await coding_llm_callback(rendered_prompt, {})

    # Extract code from markdown if present
    if "```" in generated_code:
        code_blocks = re.findall(r"```(?:\w+)?\n(.+?)```", generated_code, re.DOTALL)
        if code_blocks:
            clean_code = code_blocks[0]
        else:
            clean_code = generated_code
    else:
        clean_code = generated_code

    # Test the code
    test_result = test_generated_code(clean_code, language)

    return {
        "code": clean_code,
        "test_result": test_result,
        "prompt_id": function_prompt.id
    }
```
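
As a closing note for this example, the coroutine above can be exercised the same way. This is only a sketch: the problem statement is an arbitrary placeholder that the simulated callback happens to recognize (it checks for the word "function" in the rendered prompt).

```python
import asyncio

async def demo():
    result = await generate_function(
        problem="find max subarray sum",   # placeholder problem description
        language="python",
        optimization_goal="speed",
        include_tests=True
    )
    print(result["test_result"])   # e.g. {"success": True, "output": "..."}
    print(result["code"])

asyncio.run(demo())
```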
promptlab/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
PromptLab - A comprehensive LLM Prompt Management System

PromptLab is a Python library that provides tools for managing, versioning,
testing, and evaluating prompts for Large Language Models.

Features:
- Prompt management with versioning
- A/B testing for prompt optimization
- Evaluation framework with customizable metrics
- Command-line interface for easy integration
"""

from .core.prompt_manager import PromptManager, Prompt
from .core.version_control import VersionControl, PromptVersion
from .core.testing import PromptTesting, TestCase, TestResult, ABTestResult
from .core.evaluation import Evaluator, EvaluationMetric, ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
from .utils.metrics import create_default_metrics_set
from .utils.templating import PromptTemplate, template_registry

__version__ = "0.1.0"
__all__ = [
    "PromptManager",
    "Prompt",
    "VersionControl",
    "PromptVersion",
    "PromptTesting",
    "TestCase",
    "TestResult",
    "ABTestResult",
    "Evaluator",
    "EvaluationMetric",
    "ExactMatchMetric",
    "ContainsKeywordsMetric",
    "LengthMetric",
    "create_default_metrics_set",
    "PromptTemplate",
    "template_registry"
]
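
The re-exports above define the package's public surface. A minimal usage sketch (the storage directory name is an arbitrary example):

```python
from promptlab import PromptManager, PromptTesting, Evaluator, PromptTemplate

manager = PromptManager("example_storage")   # arbitrary storage path
testing = PromptTesting(manager)
evaluator = Evaluator(manager)
template = PromptTemplate("Hello {name}")
```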
promptlab/cli/__init__.py
ADDED
File without changes
promptlab/cli/commands.py
ADDED
@@ -0,0 +1,697 @@
import argparse
import sys
import os
import json
from typing import List, Optional, Dict, Any
import asyncio

from ..core.prompt_manager import PromptManager
from ..core.version_control import VersionControl
from ..core.testing import PromptTesting
from ..core.evaluation import Evaluator, ContainsKeywordsMetric, LengthMetric


class CLI:
    """Command-line interface for PromptLab."""
    def __init__(self):
        self.prompt_manager = PromptManager()
        self.version_control = VersionControl(self.prompt_manager)
        self.testing = PromptTesting(self.prompt_manager)
        self.evaluator = Evaluator(self.prompt_manager)

        self.parser = argparse.ArgumentParser(description="PromptLab - LLM Prompt Management System")
        self._setup_commands()

    def _setup_commands(self) -> None:
        """Set up command-line arguments."""
        subparsers = self.parser.add_subparsers(dest="command", help="Command")

        # Prompt commands
        prompt_parser = subparsers.add_parser("prompt", help="Prompt management")
        prompt_subparsers = prompt_parser.add_subparsers(dest="subcommand", help="Prompt subcommand")

        # Create prompt
        create_parser = prompt_subparsers.add_parser("create", help="Create a new prompt")
        create_parser.add_argument("name", help="Prompt name")
        create_parser.add_argument("--content", help="Prompt content")
        create_parser.add_argument("--file", help="File containing prompt content")
        create_parser.add_argument("--description", help="Prompt description")
        create_parser.add_argument("--tags", help="Comma-separated list of tags")

        # List prompts
        list_parser = prompt_subparsers.add_parser("list", help="List prompts")
        list_parser.add_argument("--tags", help="Filter by comma-separated list of tags")

        # Get prompt
        get_parser = prompt_subparsers.add_parser("get", help="Get a prompt")
        get_parser.add_argument("id", help="Prompt ID")

        # Update prompt
        update_parser = prompt_subparsers.add_parser("update", help="Update a prompt")
        update_parser.add_argument("id", help="Prompt ID")
        update_parser.add_argument("--content", help="New prompt content")
        update_parser.add_argument("--file", help="File containing new prompt content")
        update_parser.add_argument("--name", help="New prompt name")
        update_parser.add_argument("--description", help="New prompt description")
        update_parser.add_argument("--tags", help="New comma-separated list of tags")

        # Delete prompt
        delete_parser = prompt_subparsers.add_parser("delete", help="Delete a prompt")
        delete_parser.add_argument("id", help="Prompt ID")

        # Version control commands
        version_parser = subparsers.add_parser("version", help="Version control")
        version_subparsers = version_parser.add_subparsers(dest="subcommand", help="Version subcommand")

        # Commit
        commit_parser = version_subparsers.add_parser("commit", help="Create a new version")
        commit_parser.add_argument("id", help="Prompt ID")
        commit_parser.add_argument("--message", help="Commit message")

        # List versions
        list_versions_parser = version_subparsers.add_parser("list", help="List versions")
        list_versions_parser.add_argument("id", help="Prompt ID")

        # Checkout
        checkout_parser = version_subparsers.add_parser("checkout", help="Checkout a version")
        checkout_parser.add_argument("id", help="Prompt ID")
        checkout_parser.add_argument("version", type=int, help="Version number")

        # Diff
        diff_parser = version_subparsers.add_parser("diff", help="Compare versions")
        diff_parser.add_argument("id", help="Prompt ID")
        diff_parser.add_argument("version1", type=int, help="First version")
        diff_parser.add_argument("version2", type=int, help="Second version")

        # Testing commands
        test_parser = subparsers.add_parser("test", help="Testing")
        test_subparsers = test_parser.add_subparsers(dest="subcommand", help="Test subcommand")

        # Create test case
        create_test_parser = test_subparsers.add_parser("create", help="Create a test case")
        create_test_parser.add_argument("prompt_id", help="Prompt ID")
        create_test_parser.add_argument("--input", help="JSON string of input variables")
        create_test_parser.add_argument("--input-file", help="File containing JSON input variables")
        create_test_parser.add_argument("--expected", help="Expected output")
        create_test_parser.add_argument("--expected-file", help="File containing expected output")
        create_test_parser.add_argument("--name", help="Test case name")
        create_test_parser.add_argument("--description", help="Test case description")

        # List test cases
        list_tests_parser = test_subparsers.add_parser("list", help="List test cases")
        list_tests_parser.add_argument("--prompt-id", help="Filter by prompt ID")

        # Run test case
        run_test_parser = test_subparsers.add_parser("run", help="Run a test case")
        run_test_parser.add_argument("test_id", help="Test case ID")
        run_test_parser.add_argument("--llm", help="LLM callback function to use")

        # Run all test cases for a prompt
        run_all_parser = test_subparsers.add_parser("run-all", help="Run all test cases for a prompt")
        run_all_parser.add_argument("prompt_id", help="Prompt ID")
        run_all_parser.add_argument("--llm", help="LLM callback function to use")

        # A/B test
        ab_test_parser = test_subparsers.add_parser("ab", help="Run an A/B test")
        ab_test_parser.add_argument("prompt_a", help="Prompt A ID")
        ab_test_parser.add_argument("prompt_b", help="Prompt B ID")
        ab_test_parser.add_argument("--llm", help="LLM callback function to use")
        ab_test_parser.add_argument("--test-cases", help="Comma-separated list of test case IDs")

        # Evaluation commands
        eval_parser = subparsers.add_parser("eval", help="Evaluation")
        eval_subparsers = eval_parser.add_subparsers(dest="subcommand", help="Evaluation subcommand")

        # List metrics
        list_metrics_parser = eval_subparsers.add_parser("metrics", help="List evaluation metrics")

        # Register metric
        register_metric_parser = eval_subparsers.add_parser("register", help="Register a custom metric")
        register_metric_parser.add_argument("name", help="Metric name")
        register_metric_parser.add_argument("--keywords", help="Keywords for ContainsKeywordsMetric")
        register_metric_parser.add_argument("--min-length", type=int, help="Minimum length for LengthMetric")
        register_metric_parser.add_argument("--max-length", type=int, help="Maximum length for LengthMetric")
        register_metric_parser.add_argument("--target-length", type=int, help="Target length for LengthMetric")

        # Evaluate prompt
        evaluate_parser = eval_subparsers.add_parser("run", help="Evaluate a prompt")
        evaluate_parser.add_argument("prompt_id", help="Prompt ID")
        evaluate_parser.add_argument("--inputs", help="JSON string of input variables list")
        evaluate_parser.add_argument("--inputs-file", help="File containing JSON input variables list")
        evaluate_parser.add_argument("--expected", help="JSON string of expected outputs list")
        evaluate_parser.add_argument("--expected-file", help="File containing JSON expected outputs list")
        evaluate_parser.add_argument("--metrics", help="Comma-separated list of metrics to use")
        evaluate_parser.add_argument("--llm", help="LLM callback function to use")

    def run(self, args: Optional[List[str]] = None) -> None:
        """Run the CLI with the given arguments."""
        args = self.parser.parse_args(args)

        if not args.command:
            self.parser.print_help()
            return

        # Handle commands
        if args.command == "prompt":
            self._handle_prompt_command(args)
        elif args.command == "version":
            self._handle_version_command(args)
        elif args.command == "test":
            self._handle_test_command(args)
        elif args.command == "eval":
            self._handle_eval_command(args)

    def _handle_prompt_command(self, args) -> None:
        """Handle prompt commands."""
        if not args.subcommand:
            return

        if args.subcommand == "create":
            # Get content from file or argument
            content = ""
            if args.file:
                with open(args.file, "r") as f:
                    content = f.read()
            elif args.content:
                content = args.content
            else:
                print("Error: Must provide either --content or --file")
                return

            # Parse tags
            tags = []
            if args.tags:
                tags = [tag.strip() for tag in args.tags.split(",")]

            # Create prompt
            prompt = self.prompt_manager.create(
                content=content,
                name=args.name,
                description=args.description,
                tags=tags
            )

            print(f"Created prompt with ID: {prompt.id}")

        elif args.subcommand == "list":
            # Parse tags
            tags = None
            if args.tags:
                tags = [tag.strip() for tag in args.tags.split(",")]

            # List prompts
            prompts = self.prompt_manager.list(tags)

            if not prompts:
                print("No prompts found")
                return

            # Print prompts
            print(f"Found {len(prompts)} prompts:")
            for prompt in prompts:
                tags_str = ", ".join(prompt.tags) if prompt.tags else ""
                print(f"ID: {prompt.id} | Name: {prompt.name} | Tags: {tags_str}")

        elif args.subcommand == "get":
            # Get prompt
            prompt = self.prompt_manager.get(args.id)

            if not prompt:
                print(f"Prompt with ID {args.id} not found")
                return

            # Print prompt
            print(f"ID: {prompt.id}")
            print(f"Name: {prompt.name}")
            print(f"Description: {prompt.description}")
            print(f"Tags: {', '.join(prompt.tags)}")
            print(f"Version: {prompt.version}")
            print(f"Created: {prompt.created_at}")
            print(f"Updated: {prompt.updated_at}")
            print("\nContent:")
            print(prompt.content)

        elif args.subcommand == "update":
            # Get prompt
            prompt = self.prompt_manager.get(args.id)

            if not prompt:
                print(f"Prompt with ID {args.id} not found")
                return

            # Update kwargs
            kwargs = {}

            if args.name:
                kwargs["name"] = args.name

            if args.description:
                kwargs["description"] = args.description

            if args.tags:
                kwargs["tags"] = [tag.strip() for tag in args.tags.split(",")]

            # Get content from file or argument
            if args.file:
                with open(args.file, "r") as f:
                    kwargs["content"] = f.read()
            elif args.content:
                kwargs["content"] = args.content

            # Update prompt
            prompt = self.prompt_manager.update(args.id, **kwargs)

            print(f"Updated prompt with ID: {prompt.id}")

        elif args.subcommand == "delete":
            # Delete prompt
            success = self.prompt_manager.delete(args.id)

            if success:
                print(f"Deleted prompt with ID: {args.id}")
            else:
                print(f"Prompt with ID {args.id} not found")

    def _handle_version_command(self, args) -> None:
        """Handle version control commands."""
        if not args.subcommand:
            return

        if args.subcommand == "commit":
            # Commit version
            version = self.version_control.commit(
                prompt_id=args.id,
                commit_message=args.message
            )

            if not version:
                print(f"Prompt with ID {args.id} not found")
                return

            print(f"Committed version {version.version} for prompt {args.id}")

        elif args.subcommand == "list":
            # List versions
            versions = self.version_control.list_versions(args.id)

            if not versions:
                print(f"No versions found for prompt {args.id}")
                return

            # Print versions
            print(f"Found {len(versions)} versions for prompt {args.id}:")
            for version in versions:
                message = version.commit_message or "No commit message"
                print(f"Version: {version.version} | Created: {version.created_at} | Message: {message}")

        elif args.subcommand == "checkout":
            # Checkout version
            prompt = self.version_control.checkout(
                prompt_id=args.id,
                version=args.version
            )

            if not prompt:
                print(f"Prompt with ID {args.id} or version {args.version} not found")
                return

            print(f"Checked out version {args.version} for prompt {args.id}")

        elif args.subcommand == "diff":
            # Diff versions
            diff = self.version_control.diff(
                prompt_id=args.id,
                version1=args.version1,
                version2=args.version2
            )

            if not diff:
                print(f"Could not compare versions {args.version1} and {args.version2} for prompt {args.id}")
                return

            # Print diff
            print(f"Diff between version {args.version1} and {args.version2} for prompt {args.id}:")
            for line in diff["diff"]:
                print(line)

    def _handle_test_command(self, args) -> None:
        """Handle testing commands."""
        if not args.subcommand:
            return

        if args.subcommand == "create":
            # Parse input variables
            input_vars = {}
            if args.input:
                input_vars = json.loads(args.input)
            elif args.input_file:
                with open(args.input_file, "r") as f:
                    input_vars = json.loads(f.read())
            else:
                print("Error: Must provide either --input or --input-file")
                return

            # Parse expected output
            expected = None
            if args.expected:
                expected = args.expected
            elif args.expected_file:
                with open(args.expected_file, "r") as f:
                    expected = f.read()

            # Create test case
            test_case = self.testing.create_test_case(
                prompt_id=args.prompt_id,
                input_vars=input_vars,
                expected_output=expected,
                name=args.name,
                description=args.description
            )

            print(f"Created test case with ID: {test_case.id}")

        elif args.subcommand == "list":
            # List test cases
            test_cases = self.testing.list_test_cases(args.prompt_id)

            if not test_cases:
                print("No test cases found")
                return

            # Print test cases
            print(f"Found {len(test_cases)} test cases:")
            for tc in test_cases:
                print(f"ID: {tc.id} | Name: {tc.name} | Prompt ID: {tc.prompt_id}")

        elif args.subcommand == "run":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run test case
            asyncio.run(self._run_test_case(args.test_id, llm_callback))

        elif args.subcommand == "run-all":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run all test cases
            asyncio.run(self._run_all_test_cases(args.prompt_id, llm_callback))

        elif args.subcommand == "ab":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Parse test case IDs
            test_cases = None
            if args.test_cases:
                test_cases = [tc.strip() for tc in args.test_cases.split(",")]

            # Run A/B test
            asyncio.run(self._run_ab_test(args.prompt_a, args.prompt_b, llm_callback, test_cases))

    async def _run_test_case(self, test_case_id, llm_callback) -> None:
        """Run a test case."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            result = await self.testing.run_test_case(
                test_case_id=test_case_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks
            )

            print(f"Test result ID: {result.id}")
            print(f"Test case ID: {result.test_case_id}")
            print(f"Prompt ID: {result.prompt_id}")
            print(f"Prompt version: {result.prompt_version}")
            print(f"Passed: {result.passed}")

            if result.metrics:
                print("\nMetrics:")
                for name, value in result.metrics.items():
                    print(f"{name}: {value}")

            print("\nOutput:")
            print(result.output)
        except Exception as e:
            print(f"Error running test case: {e}")

    async def _run_all_test_cases(self, prompt_id, llm_callback) -> None:
        """Run all test cases for a prompt."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            results = await self.testing.run_test_cases(
                prompt_id=prompt_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks
            )

            print(f"Ran {len(results)} test cases for prompt {prompt_id}")

            # Calculate aggregate metrics
            if results:
                passed = sum(1 for r in results if r.passed)
                print(f"Passed: {passed}/{len(results)} ({passed/len(results)*100:.2f}%)")

                # Aggregate metrics
                metrics = {}
                for r in results:
                    for name, value in r.metrics.items():
                        if name not in metrics:
                            metrics[name] = []
                        metrics[name].append(value)

                print("\nAggregate metrics:")
                for name, values in metrics.items():
                    avg = sum(values) / len(values)
                    print(f"{name}: {avg:.4f}")
        except Exception as e:
            print(f"Error running test cases: {e}")

    async def _run_ab_test(self, prompt_a_id, prompt_b_id, llm_callback, test_cases) -> None:
        """Run an A/B test."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            result = await self.testing.run_ab_test(
                prompt_a_id=prompt_a_id,
                prompt_b_id=prompt_b_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks,
                test_cases=test_cases
            )

            print(f"A/B test result ID: {result.id}")
            print(f"Prompt A ID: {result.prompt_a_id}")
            print(f"Prompt B ID: {result.prompt_b_id}")
            print(f"Winner: {result.winner or 'Tie'}")

            print("\nPrompt A metrics:")
            for name, value in result.metrics_a.items():
                print(f"{name}: {value:.4f}")

            print("\nPrompt B metrics:")
            for name, value in result.metrics_b.items():
                print(f"{name}: {value:.4f}")
        except Exception as e:
            print(f"Error running A/B test: {e}")

    def _handle_eval_command(self, args) -> None:
        """Handle evaluation commands."""
        if not args.subcommand:
            return

        if args.subcommand == "metrics":
            # List metrics
            metrics = self.evaluator.list_metrics()

            if not metrics:
                print("No metrics registered")
                return

            # Print metrics
            print(f"Found {len(metrics)} metrics:")
            for metric in metrics:
                print(f"Name: {metric.name} | Description: {metric.description}")

        elif args.subcommand == "register":
            # Register custom metric
            if args.keywords:
                # Register ContainsKeywordsMetric
                keywords = [k.strip() for k in args.keywords.split(",")]
                metric = ContainsKeywordsMetric(keywords)
                self.evaluator.register_metric(metric)
                print(f"Registered ContainsKeywordsMetric with name: {metric.name}")
            elif args.min_length is not None or args.max_length is not None or args.target_length is not None:
                # Register LengthMetric
                metric = LengthMetric(
                    min_length=args.min_length,
                    max_length=args.max_length,
                    target_length=args.target_length
                )
                self.evaluator.register_metric(metric)
                print(f"Registered LengthMetric with name: {metric.name}")
            else:
                print("Error: Must provide either --keywords, --min-length, --max-length, or --target-length")

        elif args.subcommand == "run":
            # Parse inputs
            inputs = []
            if args.inputs:
                inputs = json.loads(args.inputs)
            elif args.inputs_file:
                with open(args.inputs_file, "r") as f:
                    inputs = json.loads(f.read())
            else:
                print("Error: Must provide either --inputs or --inputs-file")
                return

            # Parse expected outputs
            expected_outputs = None
            if args.expected:
                expected_outputs = json.loads(args.expected)
            elif args.expected_file:
                with open(args.expected_file, "r") as f:
                    expected_outputs = json.loads(f.read())

            # Parse metrics
            metric_names = None
            if args.metrics:
                metric_names = [m.strip() for m in args.metrics.split(",")]

            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run evaluation
            asyncio.run(self._run_evaluation(
                args.prompt_id,
                inputs,
                expected_outputs,
                metric_names,
                llm_callback
            ))

    async def _run_evaluation(self, prompt_id, inputs, expected_outputs, metric_names, llm_callback) -> None:
        """Run an evaluation."""
        try:
            result = await self.evaluator.evaluate_prompt(
                prompt_id=prompt_id,
                inputs=inputs,
                llm_callback=llm_callback,
                expected_outputs=expected_outputs,
                metric_names=metric_names
            )

            print(f"Evaluated prompt {prompt_id} with {result['num_samples']} samples")

            # Print aggregated metrics
            print("\nAggregated metrics:")
            for name, value in result["aggregated_metrics"].items():
                print(f"{name}: {value:.4f}")

            # Print individual results
            print("\nIndividual results:")
            for i, r in enumerate(result["individual_results"]):
                print(f"\nSample {i+1}:")
                print(f"Input: {json.dumps(r['input'])}")
                print(f"Output: {r['output']}")
                if r["expected"]:
                    print(f"Expected: {r['expected']}")

                print("Metrics:")
                for name, value in r["metrics"].items():
                    print(f"{name}: {value:.4f}")
        except Exception as e:
            print(f"Error running evaluation: {e}")

    def _get_llm_callback(self, llm_name: Optional[str]) -> callable:
        """Get an LLM callback function."""
        # Default to a simple echo function for testing
        if not llm_name or llm_name == "echo":
            async def echo_callback(prompt, vars):
                return f"Echo: {prompt}"
            return echo_callback

        # Add more LLM callbacks as needed
        if llm_name == "openai":
            # Example implementation using OpenAI
            try:
                import openai

                async def openai_callback(prompt, vars):
                    response = await openai.Completion.acreate(
                        model="text-davinci-003",
                        prompt=prompt,
                        max_tokens=1000
                    )
                    return response.choices[0].text.strip()

                return openai_callback
            except ImportError:
                print("Error: OpenAI package not installed. Run `pip install openai` to use this LLM.")
                sys.exit(1)

        # Add more LLM implementations as needed

        print(f"Error: Unknown LLM callback: {llm_name}")
        sys.exit(1)

    def _create_metrics_callback(self, metric_type: str) -> callable:
        """Create a metrics callback function."""
        # Simple metrics
        if metric_type == "exact_match":
            def exact_match_callback(output, expected):
                if not expected:
                    return {"exact_match": 0.0}
                return {"exact_match": 1.0 if output.strip() == expected.strip() else 0.0}
            return exact_match_callback

        elif metric_type == "similarity":
            from difflib import SequenceMatcher

            def similarity_callback(output, expected):
                if not expected:
                    return {"similarity": 0.0}
                return {"similarity": SequenceMatcher(None, output, expected).ratio()}
            return similarity_callback

        elif metric_type == "length":
            def length_callback(output, expected):
                out_len = len(output)
                if not expected:
                    return {"length": 1.0 if out_len > 0 else 0.0}

                exp_len = len(expected)
                if exp_len == 0:
                    return {"length": 1.0 if out_len == 0 else 0.0}

                # Return score inversely proportional to the difference
                ratio = min(out_len / exp_len, exp_len / out_len)
                return {"length": ratio}
            return length_callback

        # Default no-op metric
        return lambda output, expected: {}


def main():
    """Main entry point for the CLI."""
    CLI().run()


if __name__ == "__main__":
    main()
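
The argparse tree built in `_setup_commands` can also be driven programmatically by handing `CLI.run` an argument list; the sketch below does exactly that. The prompt name, content, and tag are placeholders, and no installed console-script name is assumed here.

```python
from promptlab.cli.commands import CLI

cli = CLI()
cli.run(["prompt", "create", "Greeting", "--content", "Hello {name}", "--tags", "demo"])
cli.run(["prompt", "list", "--tags", "demo"])
cli.run(["eval", "metrics"])   # lists the metrics registered by Evaluator
```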
promptlab/core/__init__.py
ADDED
File without changes
promptlab/core/evaluation.py
ADDED
@@ -0,0 +1,191 @@
import os
import json
import datetime
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable
import asyncio
from .prompt_manager import PromptManager, Prompt

class EvaluationMetric:
    """Base class for evaluation metrics."""
    def __init__(self, name: str, description: Optional[str] = None):
        self.name = name
        self.description = description or ""

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Compute the metric. Must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement compute method")

class ExactMatchMetric(EvaluationMetric):
    """Evaluates exact match between generated and expected output."""
    def __init__(self):
        super().__init__("exact_match", "Exact match between generated and expected output")

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return 1.0 if generated matches expected exactly, 0.0 otherwise."""
        if not expected_output:
            return 0.0
        return 1.0 if generated_output.strip() == expected_output.strip() else 0.0

class ContainsKeywordsMetric(EvaluationMetric):
    """Evaluates if the generated output contains specified keywords."""
    def __init__(self, keywords: List[str], case_sensitive: bool = False):
        super().__init__(
            "contains_keywords",
            f"Check if output contains keywords: {', '.join(keywords)}"
        )
        self.keywords = keywords
        self.case_sensitive = case_sensitive

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return percentage of keywords found in the output."""
        if not self.keywords:
            return 0.0

        if not self.case_sensitive:
            generated_output = generated_output.lower()
            keywords = [k.lower() for k in self.keywords]
        else:
            keywords = self.keywords

        matches = sum(1 for k in keywords if k in generated_output)
        return matches / len(keywords)

class LengthMetric(EvaluationMetric):
    """Evaluates if the generated output length is within the desired range."""
    def __init__(self, min_length: Optional[int] = None, max_length: Optional[int] = None, target_length: Optional[int] = None):
        description = "Evaluate output length"
        if target_length is not None:
            description = f"Evaluate if output length is close to {target_length} characters"
        elif min_length is not None and max_length is not None:
            description = f"Evaluate if output length is between {min_length} and {max_length} characters"
        elif min_length is not None:
            description = f"Evaluate if output length is at least {min_length} characters"
        elif max_length is not None:
            description = f"Evaluate if output length is at most {max_length} characters"

        super().__init__("length", description)
        self.min_length = min_length
        self.max_length = max_length
        self.target_length = target_length

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return score based on length conditions."""
        length = len(generated_output)

        if self.target_length is not None:
            # Score inversely proportional to the distance from target
            max_distance = self.target_length  # Normalize to a max distance
            distance = abs(length - self.target_length)
            return max(0, 1 - (distance / max_distance))

        # Check if within bounds
        within_min = self.min_length is None or length >= self.min_length
        within_max = self.max_length is None or length <= self.max_length

        if within_min and within_max:
            return 1.0
        elif within_min and self.max_length:
            # Over max length, calculate proportional penalty
            return max(0, 1 - ((length - self.max_length) / self.max_length))
        elif within_max and self.min_length:
            # Under min length, calculate proportional penalty
            return max(0, length / self.min_length)
        return 0.0

class Evaluator:
    """Manages evaluation metrics and evaluation runs."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.metrics: Dict[str, EvaluationMetric] = {}
        self.storage_path = os.path.join(prompt_manager.storage_path, "evaluations")
        os.makedirs(self.storage_path, exist_ok=True)

        # Register built-in metrics
        self.register_metric(ExactMatchMetric())
        self.register_metric(ContainsKeywordsMetric(["important", "critical", "necessary"]))
        self.register_metric(LengthMetric(min_length=50, max_length=500))

    def register_metric(self, metric: EvaluationMetric) -> None:
        """Register a new evaluation metric."""
        self.metrics[metric.name] = metric

    def get_metric(self, name: str) -> Optional[EvaluationMetric]:
        """Get a registered metric by name."""
        return self.metrics.get(name)

    def list_metrics(self) -> List[EvaluationMetric]:
        """List all registered metrics."""
        return list(self.metrics.values())

    async def evaluate_prompt(
        self,
        prompt_id: str,
        inputs: List[Dict[str, Any]],
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        expected_outputs: Optional[List[Optional[str]]] = None,
        metric_names: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Evaluate a prompt with the given inputs and metrics."""
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {prompt_id} not found")

        # Use all registered metrics if none specified
        if not metric_names:
            metrics_to_use = list(self.metrics.values())
        else:
            metrics_to_use = [self.get_metric(name) for name in metric_names if self.get_metric(name)]

        if not metrics_to_use:
            raise ValueError("No valid metrics specified")

        # Ensure expected_outputs is the same length as inputs
        if expected_outputs is None:
            expected_outputs = [None] * len(inputs)
        elif len(expected_outputs) != len(inputs):
            raise ValueError("Expected outputs must be the same length as inputs")

        results = []
        for i, (input_vars, expected) in enumerate(zip(inputs, expected_outputs)):
            # Render the prompt
            rendered_prompt = prompt.render(**input_vars)

            # Generate output
            if asyncio.iscoroutinefunction(llm_callback):
                output = await llm_callback(rendered_prompt, input_vars)
            else:
                output = llm_callback(rendered_prompt, input_vars)

            # Compute metrics
            metrics_results = {}
            for metric in metrics_to_use:
                metrics_results[metric.name] = metric.compute(output, expected, **input_vars)

            results.append({
                "input": input_vars,
                "output": output,
                "expected": expected,
                "metrics": metrics_results
            })

        # Aggregate metrics
        aggregated_metrics = {}
        for metric in metrics_to_use:
            values = [r["metrics"][metric.name] for r in results]
            aggregated_metrics[metric.name] = sum(values) / len(values) if values else 0

        evaluation_result = {
            "prompt_id": prompt_id,
            "prompt_version": prompt.version,
            "num_samples": len(inputs),
            "aggregated_metrics": aggregated_metrics,
            "individual_results": results
        }

        # Save evaluation result
        timestamp = datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-")
        file_path = os.path.join(self.storage_path, f"eval_{prompt_id}_{timestamp}.json")
        with open(file_path, "w") as f:
            json.dump(evaluation_result, f, indent=2)

        return evaluation_result
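
Custom metrics only need to subclass `EvaluationMetric` and implement `compute`. The sketch below shows a hypothetical word-count metric; the class name and threshold are illustrative and not part of the library.

```python
class WordCountMetric(EvaluationMetric):
    """Hypothetical metric: full score once the output reaches min_words words."""
    def __init__(self, min_words: int = 10):
        super().__init__("word_count", f"Check that output has at least {min_words} words")
        self.min_words = min_words

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        words = len(generated_output.split())
        return 1.0 if words >= self.min_words else words / self.min_words

# Registration against an existing Evaluator instance:
# evaluator.register_metric(WordCountMetric(min_words=25))
```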
promptlab/core/prompt_manager.py
ADDED
@@ -0,0 +1,169 @@
import os
import json
import hashlib
import datetime
from typing import Dict, List, Optional, Union, Any

class Prompt:
    def __init__(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.content = content
        self.name = name
        self.description = description or ""
        self.tags = tags or []
        self.metadata = metadata or {}
        self.created_at = datetime.datetime.now().isoformat()
        self.updated_at = self.created_at
        self.id = self._generate_id()
        self.version = 1

    def _generate_id(self) -> str:
        """Generate a unique ID based on content and name."""
        unique_string = f"{self.name}:{self.content}:{self.created_at}"
        return hashlib.md5(unique_string.encode()).hexdigest()[:10]

    def update(self, content: Optional[str] = None, **kwargs) -> None:
        """Update prompt attributes."""
        if content is not None:
            self.content = content

        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        self.updated_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert prompt to dictionary."""
        return {
            "id": self.id,
            "name": self.name,
            "content": self.content,
            "description": self.description,
            "tags": self.tags,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "version": self.version
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Prompt":
        """Create prompt from dictionary."""
        prompt = cls(
            content=data["content"],
            name=data["name"],
            description=data.get("description", ""),
            tags=data.get("tags", []),
            metadata=data.get("metadata", {})
        )
        prompt.id = data["id"]
        prompt.created_at = data["created_at"]
        prompt.updated_at = data["updated_at"]
        prompt.version = data["version"]
        return prompt

    def render(self, **kwargs) -> str:
        """Render prompt with provided variables."""
        rendered = self.content
        for key, value in kwargs.items():
            placeholder = f"{{{key}}}"
            rendered = rendered.replace(placeholder, str(value))
        return rendered


class PromptManager:
    def __init__(self, storage_path: Optional[str] = None):
        self.storage_path = storage_path or os.path.join(os.getcwd(), "promptlab_storage")
        self.prompts: Dict[str, Prompt] = {}
        self._ensure_storage_dir()
        self._load_prompts()

    def _ensure_storage_dir(self) -> None:
        """Ensure storage directory exists."""
        os.makedirs(self.storage_path, exist_ok=True)

    def _load_prompts(self) -> None:
        """Load prompts from storage."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        if not os.path.exists(prompts_dir):
            os.makedirs(prompts_dir)
            return

        for filename in os.listdir(prompts_dir):
            if filename.endswith(".json"):
                with open(os.path.join(prompts_dir, filename), "r") as f:
                    prompt_data = json.load(f)
                    prompt = Prompt.from_dict(prompt_data)
                    self.prompts[prompt.id] = prompt

    def _save_prompt(self, prompt: Prompt) -> None:
        """Save prompt to storage."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        os.makedirs(prompts_dir, exist_ok=True)

        prompt_path = os.path.join(prompts_dir, f"{prompt.id}.json")
        with open(prompt_path, "w") as f:
            json.dump(prompt.to_dict(), f, indent=2)

    def create(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Prompt:
        """Create a new prompt."""
        prompt = Prompt(
            content=content,
            name=name,
            description=description,
            tags=tags,
            metadata=metadata
        )
        self.prompts[prompt.id] = prompt
        self._save_prompt(prompt)
        return prompt

    def get(self, prompt_id: str) -> Optional[Prompt]:
        """Get prompt by ID."""
        return self.prompts.get(prompt_id)

    def update(self, prompt_id: str, **kwargs) -> Optional[Prompt]:
        """Update prompt by ID."""
        prompt = self.get(prompt_id)
        if prompt:
            prompt.update(**kwargs)
            self._save_prompt(prompt)
        return prompt

    def delete(self, prompt_id: str) -> bool:
        """Delete prompt by ID."""
        if prompt_id in self.prompts:
            del self.prompts[prompt_id]
            prompt_path = os.path.join(self.storage_path, "prompts", f"{prompt_id}.json")
            if os.path.exists(prompt_path):
                os.remove(prompt_path)
            return True
        return False

    def list(self, tags: Optional[List[str]] = None) -> List[Prompt]:
        """List prompts, optionally filtered by tags."""
        if tags:
            return [p for p in self.prompts.values() if any(tag in p.tags for tag in tags)]
        return list(self.prompts.values())

    def search(self, query: str) -> List[Prompt]:
        """Search prompts by name or content."""
        query = query.lower()
        return [
            p for p in self.prompts.values()
            if query in p.name.lower() or query in p.content.lower()
        ]
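
A minimal round-trip sketch of the `Prompt`/`PromptManager` API above (the storage path and prompt text are arbitrary examples):

```python
from promptlab.core.prompt_manager import PromptManager

manager = PromptManager("example_storage")   # arbitrary storage path

prompt = manager.create(
    content="Summarize the following text: {text}",
    name="Summarizer",
    tags=["summarization"]
)

rendered = manager.get(prompt.id).render(text="PromptLab stores prompts as JSON files.")
print(rendered)

manager.update(prompt.id, description="Summarizes arbitrary text")
print(manager.search("summarize")[0].name)   # "Summarizer"
```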
promptlab/core/testing.py
ADDED
@@ -0,0 +1,451 @@
import os
import json
import uuid
import datetime
import asyncio
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable, Tuple
from .prompt_manager import Prompt, PromptManager

class TestCase:
    """Represents a test case for a prompt."""
    def __init__(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.prompt_id = prompt_id
        self.input_vars = input_vars
        self.expected_output = expected_output
        self.name = name or f"Test case {self.id}"
        self.description = description or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert test case to dictionary."""
        return {
            "id": self.id,
            "prompt_id": self.prompt_id,
            "input_vars": self.input_vars,
            "expected_output": self.expected_output,
            "name": self.name,
            "description": self.description,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestCase":
        """Create test case from dictionary."""
        test_case = cls(
            prompt_id=data["prompt_id"],
            input_vars=data["input_vars"],
            expected_output=data.get("expected_output"),
            name=data.get("name"),
            description=data.get("description")
        )
        test_case.id = data["id"]
        test_case.created_at = data["created_at"]
        return test_case


class TestResult:
    """Represents the result of a test case execution."""
    def __init__(
        self,
        test_case_id: str,
        prompt_id: str,
        prompt_version: int,
        output: str,
        passed: Optional[bool] = None,
        metrics: Optional[Dict[str, float]] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.test_case_id = test_case_id
        self.prompt_id = prompt_id
        self.prompt_version = prompt_version
        self.output = output
        self.passed = passed
        self.metrics = metrics or {}
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert test result to dictionary."""
        return {
            "id": self.id,
            "test_case_id": self.test_case_id,
            "prompt_id": self.prompt_id,
            "prompt_version": self.prompt_version,
            "output": self.output,
            "passed": self.passed,
            "metrics": self.metrics,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestResult":
        """Create test result from dictionary."""
        return cls(
            test_case_id=data["test_case_id"],
            prompt_id=data["prompt_id"],
            prompt_version=data["prompt_version"],
            output=data["output"],
            passed=data.get("passed"),
            metrics=data.get("metrics", {})
        )


class ABTestResult:
    """Represents the result of an A/B test."""
    def __init__(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        prompt_a_version: int,
        prompt_b_version: int,
        metrics_a: Dict[str, float],
        metrics_b: Dict[str, float],
        winner: Optional[str] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.prompt_a_id = prompt_a_id
        self.prompt_b_id = prompt_b_id
        self.prompt_a_version = prompt_a_version
        self.prompt_b_version = prompt_b_version
        self.metrics_a = metrics_a
        self.metrics_b = metrics_b
        self.winner = winner
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert A/B test result to dictionary."""
        return {
            "id": self.id,
            "prompt_a_id": self.prompt_a_id,
            "prompt_b_id": self.prompt_b_id,
            "prompt_a_version": self.prompt_a_version,
            "prompt_b_version": self.prompt_b_version,
            "metrics_a": self.metrics_a,
            "metrics_b": self.metrics_b,
            "winner": self.winner,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ABTestResult":
        """Create A/B test result from dictionary."""
        return cls(
            prompt_a_id=data["prompt_a_id"],
            prompt_b_id=data["prompt_b_id"],
            prompt_a_version=data["prompt_a_version"],
            prompt_b_version=data["prompt_b_version"],
            metrics_a=data["metrics_a"],
            metrics_b=data["metrics_b"],
            winner=data.get("winner")
        )


class PromptTesting:
    """Manages testing for prompts."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "tests")
        os.makedirs(self.storage_path, exist_ok=True)

        # Storage paths
        self.test_cases_path = os.path.join(self.storage_path, "test_cases")
        self.test_results_path = os.path.join(self.storage_path, "test_results")
        self.ab_test_results_path = os.path.join(self.storage_path, "ab_test_results")

        os.makedirs(self.test_cases_path, exist_ok=True)
        os.makedirs(self.test_results_path, exist_ok=True)
        os.makedirs(self.ab_test_results_path, exist_ok=True)

        self.test_cases: Dict[str, TestCase] = {}
        self.test_results: Dict[str, TestResult] = {}
        self.ab_test_results: Dict[str, ABTestResult] = {}

        self._load_test_cases()
        self._load_test_results()
        self._load_ab_test_results()

    def _load_test_cases(self) -> None:
        """Load test cases from storage."""
        for filename in os.listdir(self.test_cases_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_cases_path, filename), "r") as f:
                    data = json.load(f)
                    test_case = TestCase.from_dict(data)
                    self.test_cases[test_case.id] = test_case

    def _load_test_results(self) -> None:
        """Load test results from storage."""
        for filename in os.listdir(self.test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_results_path, filename), "r") as f:
                    data = json.load(f)
                    test_result = TestResult.from_dict(data)
                    self.test_results[test_result.id] = test_result

    def _load_ab_test_results(self) -> None:
        """Load A/B test results from storage."""
        for filename in os.listdir(self.ab_test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.ab_test_results_path, filename), "r") as f:
                    data = json.load(f)
                    ab_test_result = ABTestResult.from_dict(data)
                    self.ab_test_results[ab_test_result.id] = ab_test_result

    def _save_test_case(self, test_case: TestCase) -> None:
        """Save test case to storage."""
        file_path = os.path.join(self.test_cases_path, f"{test_case.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_case.to_dict(), f, indent=2)

    def _save_test_result(self, test_result: TestResult) -> None:
        """Save test result to storage."""
        file_path = os.path.join(self.test_results_path, f"{test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_result.to_dict(), f, indent=2)

    def _save_ab_test_result(self, ab_test_result: ABTestResult) -> None:
        """Save A/B test result to storage."""
        file_path = os.path.join(self.ab_test_results_path, f"{ab_test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(ab_test_result.to_dict(), f, indent=2)

    def create_test_case(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ) -> TestCase:
        """Create a test case for a prompt."""
        test_case = TestCase(
            prompt_id=prompt_id,
            input_vars=input_vars,
            expected_output=expected_output,
            name=name,
            description=description
        )
        self.test_cases[test_case.id] = test_case
        self._save_test_case(test_case)
        return test_case

    def get_test_case(self, test_case_id: str) -> Optional[TestCase]:
        """Get a test case by ID."""
        return self.test_cases.get(test_case_id)

    def list_test_cases(self, prompt_id: Optional[str] = None) -> List[TestCase]:
        """List test cases, optionally filtered by prompt ID."""
        if prompt_id:
            return [tc for tc in self.test_cases.values() if tc.prompt_id == prompt_id]
        return list(self.test_cases.values())

    def delete_test_case(self, test_case_id: str) -> bool:
        """Delete a test case by ID."""
        if test_case_id in self.test_cases:
            del self.test_cases[test_case_id]
            file_path = os.path.join(self.test_cases_path, f"{test_case_id}.json")
            if os.path.exists(file_path):
                os.remove(file_path)
            return True
        return False

    async def run_test_case(
        self,
        test_case_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> TestResult:
        """Run a test case with the given LLM callback."""
        test_case = self.get_test_case(test_case_id)
        if not test_case:
            raise ValueError(f"Test case with ID {test_case_id} not found")

        prompt = self.prompt_manager.get(test_case.prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {test_case.prompt_id} not found")

        # Render the prompt with the input variables
        rendered_prompt = prompt.render(**test_case.input_vars)

        # Call the LLM with the rendered prompt
        if asyncio.iscoroutinefunction(llm_callback):
            output = await llm_callback(rendered_prompt, test_case.input_vars)
        else:
            output = llm_callback(rendered_prompt, test_case.input_vars)

        # Determine if the test passed
        passed = None
        if test_case.expected_output:
            passed = output.strip() == test_case.expected_output.strip()

        # Calculate metrics if callbacks are provided
        metrics = {}
        if metrics_callbacks:
            for metric_callback in metrics_callbacks:
                metrics.update(metric_callback(output, test_case.expected_output or ""))

        # Create and save the test result
        test_result = TestResult(
            test_case_id=test_case.id,
            prompt_id=test_case.prompt_id,
            prompt_version=prompt.version,
            output=output,
            passed=passed,
            metrics=metrics
        )
        self.test_results[test_result.id] = test_result
        self._save_test_result(test_result)

        return test_result

    async def run_test_cases(
        self,
        prompt_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> List[TestResult]:
        """Run all test cases for a prompt."""
        test_cases = self.list_test_cases(prompt_id)
        results = []

        for test_case in test_cases:
            result = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            results.append(result)

        return results

    async def run_ab_test(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: List[Callable[[str, str], Dict[str, float]]],
        test_cases: Optional[List[str]] = None
    ) -> ABTestResult:
        """Run an A/B test with two prompts."""
        prompt_a = self.prompt_manager.get(prompt_a_id)
        prompt_b = self.prompt_manager.get(prompt_b_id)

        if not prompt_a or not prompt_b:
            raise ValueError("Both prompts must exist")

        # Get test cases to use
        if test_cases:
            # Use specified test cases
            test_case_objs = [self.get_test_case(tc_id) for tc_id in test_cases]
            test_case_objs = [tc for tc in test_case_objs if tc]
        else:
            # Use all test cases for prompt A
            test_case_objs = self.list_test_cases(prompt_a_id)

        if not test_case_objs:
            raise ValueError("No test cases found for the A/B test")

        # Run test cases for both prompts
        results_a = []
        results_b = []

        for test_case in test_case_objs:
            # Create a copy of the test case for prompt B
            if test_case.prompt_id != prompt_b_id:
                test_case_b = self.create_test_case(
                    prompt_id=prompt_b_id,
                    input_vars=test_case.input_vars,
                    expected_output=test_case.expected_output,
                    name=f"Copy of {test_case.name} for B",
                    description=test_case.description
                )
            else:
                test_case_b = test_case

            # Run the test cases
            result_a = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            result_b = await self.run_test_case(test_case_b.id, llm_callback, metrics_callbacks)

            results_a.append(result_a)
            results_b.append(result_b)

        # Calculate aggregate metrics
        metrics_a = self._aggregate_metrics([r.metrics for r in results_a])
        metrics_b = self._aggregate_metrics([r.metrics for r in results_b])

        # Determine winner
        winner = self._determine_winner(metrics_a, metrics_b)

        # Create and save the A/B test result
        ab_test_result = ABTestResult(
            prompt_a_id=prompt_a_id,
            prompt_b_id=prompt_b_id,
            prompt_a_version=prompt_a.version,
            prompt_b_version=prompt_b.version,
            metrics_a=metrics_a,
            metrics_b=metrics_b,
            winner=winner
        )
        self.ab_test_results[ab_test_result.id] = ab_test_result
        self._save_ab_test_result(ab_test_result)

        return ab_test_result

    def _aggregate_metrics(self, metrics_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Aggregate metrics from multiple test results."""
        if not metrics_list:
            return {}

        aggregated = {}
        for key in metrics_list[0].keys():
            values = [m.get(key, 0) for m in metrics_list]
            aggregated[key] = sum(values) / len(values)  # Simple average

        return aggregated

    def _determine_winner(self, metrics_a: Dict[str, float], metrics_b: Dict[str, float]) -> Optional[str]:
        """Determine winner of A/B test based on metrics."""
        if not metrics_a or not metrics_b:
            return None

        # Assume higher values are better for all metrics
        a_wins = 0
        b_wins = 0

        for key in metrics_a.keys():
            if key in metrics_b:
                if metrics_a[key] > metrics_b[key]:
                    a_wins += 1
                elif metrics_b[key] > metrics_a[key]:
                    b_wins += 1

        if a_wins > b_wins:
            return "A"
        elif b_wins > a_wins:
            return "B"
        else:
            return None  # Tie

    def get_test_results(self, test_case_id: Optional[str] = None, prompt_id: Optional[str] = None) -> List[TestResult]:
        """Get test results, optionally filtered by test case ID or prompt ID."""
        results = list(self.test_results.values())

        if test_case_id:
            results = [r for r in results if r.test_case_id == test_case_id]

        if prompt_id:
            results = [r for r in results if r.prompt_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)

    def get_ab_test_results(self, prompt_id: Optional[str] = None) -> List[ABTestResult]:
        """Get A/B test results, optionally filtered by prompt ID."""
        results = list(self.ab_test_results.values())

        if prompt_id:
            results = [r for r in results if r.prompt_a_id == prompt_id or r.prompt_b_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)
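A minimal single-test sketch of the flow above, with a stubbed synchronous callback standing in for a real model call. The prompt text, expected output, and metric are illustrative, and it assumes Prompt exposes render() and a version attribute, as run_test_case relies on.

    # Hypothetical usage sketch; fake_llm and exact() are stand-ins, not library code.
    import asyncio
    from promptlab.core.prompt_manager import PromptManager
    from promptlab.core.testing import PromptTesting

    manager = PromptManager("./promptlab_storage")
    testing = PromptTesting(manager)

    prompt = manager.create(content="Translate to French: {text}", name="Translator")
    case = testing.create_test_case(
        prompt_id=prompt.id,
        input_vars={"text": "hello"},
        expected_output="bonjour",
    )

    def fake_llm(rendered_prompt, input_vars):
        return "bonjour"  # a real integration would call an LLM API here

    def exact(output, expected):
        return {"exact": 1.0 if output.strip() == expected.strip() else 0.0}

    result = asyncio.run(testing.run_test_case(case.id, fake_llm, [exact]))
    print(result.passed, result.metrics)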
promptlab/core/version_control.py
ADDED
@@ -0,0 +1,161 @@
import os
import json
import datetime
from typing import Dict, List, Optional, Any
from .prompt_manager import Prompt, PromptManager

class PromptVersion:
    """Represents a specific version of a prompt."""
    def __init__(
        self,
        prompt_id: str,
        version: int,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        commit_message: Optional[str] = None
    ):
        self.prompt_id = prompt_id
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.commit_message = commit_message or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert version to dictionary."""
        return {
            "prompt_id": self.prompt_id,
            "version": self.version,
            "content": self.content,
            "metadata": self.metadata,
            "commit_message": self.commit_message,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
        """Create version from dictionary."""
        return cls(
            prompt_id=data["prompt_id"],
            version=data["version"],
            content=data["content"],
            metadata=data.get("metadata", {}),
            commit_message=data.get("commit_message", "")
        )


class VersionControl:
    """Manages versioning for prompts."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "versions")
        os.makedirs(self.storage_path, exist_ok=True)
        self.versions: Dict[str, Dict[int, PromptVersion]] = {}
        self._load_versions()

    def _load_versions(self) -> None:
        """Load versions from storage."""
        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
            return

        for prompt_id_dir in os.listdir(self.storage_path):
            prompt_dir = os.path.join(self.storage_path, prompt_id_dir)
            if os.path.isdir(prompt_dir):
                self.versions[prompt_id_dir] = {}

                for filename in os.listdir(prompt_dir):
                    if filename.endswith(".json"):
                        with open(os.path.join(prompt_dir, filename), "r") as f:
                            version_data = json.load(f)
                            version = PromptVersion.from_dict(version_data)
                            self.versions[prompt_id_dir][version.version] = version

    def _save_version(self, version: PromptVersion) -> None:
        """Save version to storage."""
        prompt_dir = os.path.join(self.storage_path, version.prompt_id)
        os.makedirs(prompt_dir, exist_ok=True)

        version_path = os.path.join(prompt_dir, f"v{version.version}.json")
        with open(version_path, "w") as f:
            json.dump(version.to_dict(), f, indent=2)

    def commit(
        self,
        prompt_id: str,
        commit_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Optional[PromptVersion]:
        """Create a new version of a prompt."""
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            return None

        # Initialize versions dict for this prompt if it doesn't exist
        if prompt_id not in self.versions:
            self.versions[prompt_id] = {}

        # Get the highest version number for this prompt
        current_versions = self.versions.get(prompt_id, {})
        next_version = max(current_versions.keys(), default=0) + 1

        # Create the new version
        version = PromptVersion(
            prompt_id=prompt_id,
            version=next_version,
            content=prompt.content,
            metadata=metadata or {},
            commit_message=commit_message
        )

        # Save the new version
        self.versions[prompt_id][next_version] = version
        self._save_version(version)

        # Update the prompt's version number
        prompt.version = next_version
        self.prompt_manager._save_prompt(prompt)

        return version

    def get_version(self, prompt_id: str, version: int) -> Optional[PromptVersion]:
        """Get a specific version of a prompt."""
        return self.versions.get(prompt_id, {}).get(version)

    def list_versions(self, prompt_id: str) -> List[PromptVersion]:
        """List all versions of a prompt."""
        versions = self.versions.get(prompt_id, {})
        return sorted(versions.values(), key=lambda v: v.version)

    def checkout(self, prompt_id: str, version: int) -> Optional[Prompt]:
        """Checkout a specific version of a prompt."""
        prompt = self.prompt_manager.get(prompt_id)
        version_obj = self.get_version(prompt_id, version)

        if not prompt or not version_obj:
            return None

        prompt.content = version_obj.content
        prompt.version = version
        prompt.updated_at = datetime.datetime.now().isoformat()

        self.prompt_manager._save_prompt(prompt)
        return prompt

    def diff(self, prompt_id: str, version1: int, version2: int) -> Dict[str, Any]:
        """Compare two versions of a prompt."""
        v1 = self.get_version(prompt_id, version1)
        v2 = self.get_version(prompt_id, version2)

        if not v1 or not v2:
            return {}

        import difflib
        d = difflib.Differ()
        diff = list(d.compare(v1.content.splitlines(), v2.content.splitlines()))

        return {
            "version1": version1,
            "version2": version2,
            "diff": diff
        }
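A short sketch of the commit/diff cycle above. The prompt wording and commit messages are examples, and it assumes PromptManager.update() accepts a content keyword, as in the basic usage example later in this commit.

    # Hypothetical usage sketch: two commits, then a line-level diff.
    from promptlab.core.prompt_manager import PromptManager
    from promptlab.core.version_control import VersionControl

    manager = PromptManager("./promptlab_storage")
    vc = VersionControl(manager)

    prompt = manager.create(content="Answer briefly: {question}", name="Brief QA")
    vc.commit(prompt.id, commit_message="initial wording")

    manager.update(prompt.id, content="Answer in one sentence: {question}")
    vc.commit(prompt.id, commit_message="tightened instruction")

    for line in vc.diff(prompt.id, 1, 2)["diff"]:
        print(line)  # difflib.Differ output: "+", "-", "?" prefixed lines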
promptlab/examples/__init__.py
ADDED
File without changes
promptlab/examples/ab_testing.py
ADDED
@@ -0,0 +1,117 @@
"""
A/B testing example for PromptLab.

This example demonstrates how to use PromptLab to perform A/B testing
on different prompt variations to find the most effective one.
"""

import asyncio
import os
from promptlab import PromptManager, PromptTesting

async def llm_callback(prompt, vars):
    """
    Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API.
    """
    # Simple simulation - return different responses based on prompt content
    if "concise" in prompt.lower():
        return "This is a short, concise response."
    elif "detailed" in prompt.lower():
        return "This is a much more detailed response that provides additional context and information about the query. It elaborates on various aspects and provides a comprehensive answer."
    else:
        return "Default response."

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize testing
    testing = PromptTesting(prompt_manager)

    # Create two prompt variations
    prompt_a = prompt_manager.create(
        content="Provide a concise answer to the following question: {question}",
        name="Concise Prompt",
        description="A prompt that asks for concise answers",
        tags=["concise", "test"]
    )

    prompt_b = prompt_manager.create(
        content="Provide a detailed and comprehensive answer to the following question: {question}",
        name="Detailed Prompt",
        description="A prompt that asks for detailed answers",
        tags=["detailed", "test"]
    )

    print(f"Created prompt A with ID: {prompt_a.id}")
    print(f"Created prompt B with ID: {prompt_b.id}")

    # Create test cases
    test_cases = []

    questions = [
        "What is machine learning?",
        "How does a neural network work?",
        "What are the benefits of version control?"
    ]

    for i, question in enumerate(questions):
        test_case = testing.create_test_case(
            prompt_id=prompt_a.id,
            input_vars={"question": question},
            name=f"Test Case {i+1}",
            description=f"Test case for question: {question}"
        )
        test_cases.append(test_case.id)

    print(f"Created {len(test_cases)} test cases")

    # Define metrics callbacks
    def length_metric(output, expected):
        """Measure output length as a metric."""
        return {"length": len(output) / 1000}  # Normalize to 0-1 range

    def keyword_metric(output, expected):
        """Check for presence of keywords."""
        keywords = ["machine", "learning", "neural", "network", "version", "control"]
        matches = sum(1 for k in keywords if k.lower() in output.lower())
        return {"keyword_matches": matches / len(keywords)}

    # Run A/B test
    ab_result = await testing.run_ab_test(
        prompt_a_id=prompt_a.id,
        prompt_b_id=prompt_b.id,
        llm_callback=llm_callback,
        metrics_callbacks=[length_metric, keyword_metric],
        test_cases=test_cases
    )

    print(f"A/B test completed with ID: {ab_result.id}")
    print(f"Prompt A metrics: {ab_result.metrics_a}")
    print(f"Prompt B metrics: {ab_result.metrics_b}")
    print(f"Winner: {ab_result.winner or 'Tie'}")

    # List all test results
    results_a = testing.get_test_results(prompt_id=prompt_a.id)
    results_b = testing.get_test_results(prompt_id=prompt_b.id)

    print(f"Found {len(results_a)} test results for prompt A")
    print(f"Found {len(results_b)} test results for prompt B")

    # Display individual test results
    print("\nSample outputs:")

    for i, (result_a, result_b) in enumerate(zip(results_a[:3], results_b[:3])):
        print(f"\nTest Case {i+1}:")

        print("\nConcise prompt output:")
        print(result_a.output)

        print("\nDetailed prompt output:")
        print(result_b.output)

if __name__ == "__main__":
    asyncio.run(main())
promptlab/examples/basic_usage.py
ADDED
@@ -0,0 +1,109 @@
"""
Basic usage example for PromptLab.

This example demonstrates the fundamental features of PromptLab
including creating prompts, versioning, and rendering.
"""

import asyncio
import os
from promptlab import PromptManager, VersionControl

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize version control
    version_control = VersionControl(prompt_manager)

    # Create a basic prompt
    basic_prompt = prompt_manager.create(
        content="Hello, my name is {name} and I am a {occupation}.",
        name="Introduction",
        description="A simple introduction prompt",
        tags=["basic", "introduction"]
    )

    print(f"Created prompt with ID: {basic_prompt.id}")

    # Render the prompt with variables
    rendered = basic_prompt.render(name="Alice", occupation="Data Scientist")
    print(f"Rendered prompt: {rendered}")

    # Create a more complex prompt
    complex_prompt = prompt_manager.create(
        content="""
        System: {system_message}

        User: {user_message}

        Assistant:
        """,
        name="Chat Interaction",
        description="A prompt for chat interactions",
        tags=["chat", "interaction"]
    )

    print(f"Created complex prompt with ID: {complex_prompt.id}")

    # Render the complex prompt
    rendered = complex_prompt.render(
        system_message="You are a helpful assistant.",
        user_message="Can you help me with Python programming?"
    )
    print(f"Rendered complex prompt:\n{rendered}")

    # Create a version
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Initial version"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # Update the prompt
    complex_prompt = prompt_manager.update(
        complex_prompt.id,
        content="""
        System: {system_message}

        User: {user_message}

        Think step by step:
        {thinking}

        Assistant:
        """
    )

    print(f"Updated prompt with ID: {complex_prompt.id}")

    # Create another version
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Added thinking step"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # List all versions
    versions = version_control.list_versions(complex_prompt.id)
    print(f"Found {len(versions)} versions for prompt {complex_prompt.id}:")
    for v in versions:
        print(f"Version: {v.version} | Created: {v.created_at} | Message: {v.commit_message}")

    # Checkout a specific version
    prompt = version_control.checkout(complex_prompt.id, 1)
    print(f"Checked out version 1 for prompt {complex_prompt.id}")
    print(f"Content:\n{prompt.content}")

    # List all prompts
    prompts = prompt_manager.list()
    print(f"Found {len(prompts)} prompts:")
    for p in prompts:
        print(f"ID: {p.id} | Name: {p.name} | Tags: {', '.join(p.tags)}")

if __name__ == "__main__":
    asyncio.run(main())
promptlab/examples/evaluation_example.py
ADDED
@@ -0,0 +1,95 @@
"""
Evaluation example for PromptLab.

This example demonstrates how to use PromptLab's evaluation framework
to measure the quality of prompts using various metrics.
"""

import asyncio
import os
from promptlab import PromptManager, Evaluator, ContainsKeywordsMetric, LengthMetric

async def llm_callback(prompt, vars):
    """
    Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API.
    """
    # Simple simulation based on input text
    text = vars.get("text", "")

    if "code" in text.lower():
        return "```python\ndef hello_world():\n print('Hello, world!')\n```"
    elif "list" in text.lower():
        return "1. First item\n2. Second item\n3. Third item"
    elif "summary" in text.lower():
        return f"This is a summary of the text about {text.split()[0]}."
    else:
        return f"Response to: {text}"

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize evaluator
    evaluator = Evaluator(prompt_manager)

    # Create a prompt for evaluation
    prompt = prompt_manager.create(
        content="Please {action} the following text: {text}",
        name="Dynamic Action Prompt",
        description="A prompt that can perform different actions based on input",
        tags=["action", "dynamic"]
    )

    print(f"Created prompt with ID: {prompt.id}")

    # Register custom metrics
    code_keywords = ContainsKeywordsMetric(
        keywords=["def", "print", "function", "return"],
        case_sensitive=False
    )
    evaluator.register_metric(code_keywords)

    list_keywords = ContainsKeywordsMetric(
        keywords=["1.", "2.", "3.", "item"],
        case_sensitive=False
    )
    evaluator.register_metric(list_keywords)

    length_metric = LengthMetric(min_length=10, max_length=500)
    evaluator.register_metric(length_metric)

    # Create test inputs for different actions
    test_inputs = [
        {"action": "write code for", "text": "a simple hello world function"},
        {"action": "create a list of", "text": "three important items"},
        {"action": "summarize", "text": "machine learning concepts in data science"},
        {"action": "analyze", "text": "the impact of climate change on ecosystems"}
    ]

    # Run evaluation
    evaluation_result = await evaluator.evaluate_prompt(
        prompt_id=prompt.id,
        inputs=test_inputs,
        llm_callback=llm_callback
    )

    # Print evaluation results
    print("\nEvaluation completed!")
    print("\nAggregated metrics:")
    for name, value in evaluation_result["aggregated_metrics"].items():
        print(f"{name}: {value:.4f}")

    print("\nIndividual results:")
    for i, result in enumerate(evaluation_result["individual_results"]):
        print(f"\nTest {i+1} ({result['input']['action']} {result['input']['text']}):")
        print(f"Output: {result['output']}")

        print("Metrics:")
        for name, value in result["metrics"].items():
            print(f"  {name}: {value:.4f}")

if __name__ == "__main__":
    asyncio.run(main())
promptlab/tests/__init__.py
ADDED
File without changes
promptlab/tests/test_evaluation.py
ADDED
File without changes
promptlab/tests/test_prompt_manager.py
ADDED
@@ -0,0 +1,115 @@
import unittest
import os
import shutil
import tempfile
from promptlab.core.prompt_manager import PromptManager, Prompt

class TestPromptManager(unittest.TestCase):
    def setUp(self):
        """Set up test environment."""
        self.test_dir = tempfile.mkdtemp()
        self.prompt_manager = PromptManager(self.test_dir)

    def tearDown(self):
        """Clean up test environment."""
        shutil.rmtree(self.test_dir)

    def test_create_prompt(self):
        """Test creating a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt {var}",
            name="Test Prompt",
            description="A test prompt",
            tags=["test", "example"]
        )

        self.assertIsNotNone(prompt)
        self.assertEqual(prompt.name, "Test Prompt")
        self.assertEqual(prompt.content, "Test prompt {var}")
        self.assertEqual(prompt.description, "A test prompt")
        self.assertEqual(prompt.tags, ["test", "example"])

    def test_get_prompt(self):
        """Test getting a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        retrieved = self.prompt_manager.get(prompt.id)

        self.assertIsNotNone(retrieved)
        self.assertEqual(retrieved.id, prompt.id)
        self.assertEqual(retrieved.name, prompt.name)
        self.assertEqual(retrieved.content, prompt.content)

    def test_update_prompt(self):
        """Test updating a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        updated = self.prompt_manager.update(
            prompt.id,
            content="Updated prompt",
            name="Updated Name"
        )

        self.assertEqual(updated.content, "Updated prompt")
        self.assertEqual(updated.name, "Updated Name")

        # Check that the update was persisted
        retrieved = self.prompt_manager.get(prompt.id)
        self.assertEqual(retrieved.content, "Updated prompt")
        self.assertEqual(retrieved.name, "Updated Name")

    def test_delete_prompt(self):
        """Test deleting a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        success = self.prompt_manager.delete(prompt.id)

        self.assertTrue(success)
        self.assertIsNone(self.prompt_manager.get(prompt.id))

    def test_list_prompts(self):
        """Test listing prompts."""
        self.prompt_manager.create(
            content="Test prompt 1",
            name="Test Prompt 1",
            tags=["test", "one"]
        )

        self.prompt_manager.create(
            content="Test prompt 2",
            name="Test Prompt 2",
            tags=["test", "two"]
        )

        all_prompts = self.prompt_manager.list()
        self.assertEqual(len(all_prompts), 2)

        test_tag_prompts = self.prompt_manager.list(tags=["test"])
        self.assertEqual(len(test_tag_prompts), 2)

        one_tag_prompts = self.prompt_manager.list(tags=["one"])
        self.assertEqual(len(one_tag_prompts), 1)
        self.assertEqual(one_tag_prompts[0].name, "Test Prompt 1")

    def test_render_prompt(self):
        """Test rendering a prompt with variables."""
        prompt = self.prompt_manager.create(
            content="Hello, {name}! You are a {occupation}.",
            name="Test Prompt"
        )

        rendered = prompt.render(name="Alice", occupation="Data Scientist")

        self.assertEqual(rendered, "Hello, Alice! You are a Data Scientist.")

if __name__ == "__main__":
    unittest.main()
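Since the suite is plain unittest, standard test discovery should pick this file up. A minimal sketch, assuming it is run from the repository root; it is equivalent to invoking unittest discovery from the command line.

    # Hypothetical runner sketch for the tests above.
    import unittest

    suite = unittest.defaultTestLoader.discover("promptlab/tests")
    unittest.TextTestRunner(verbosity=2).run(suite)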
promptlab/tests/test_testing.py
ADDED
File without changes
promptlab/tests/test_version_control.py
ADDED
File without changes
promptlab/utils/__init__.py
ADDED
File without changes
promptlab/utils/metrics.py
ADDED
@@ -0,0 +1,161 @@
from typing import Dict, List, Optional, Any, Union, Callable
import re
import numpy as np
from difflib import SequenceMatcher

def exact_match(generated: str, expected: str) -> float:
    """Calculate exact match score (1.0 if exact match, 0.0 otherwise)."""
    if not expected or not generated:
        return 0.0
    return 1.0 if generated.strip() == expected.strip() else 0.0

def contains_all(generated: str, items: List[str], case_sensitive: bool = False) -> float:
    """Check if generated text contains all items in the list."""
    if not items:
        return 0.0

    if not case_sensitive:
        generated = generated.lower()
        items = [item.lower() for item in items]

    matches = sum(1 for item in items if item in generated)
    return matches / len(items)

def similarity_score(str1: str, str2: str) -> float:
    """Calculate string similarity using difflib."""
    if not str1 or not str2:
        return 0.0
    return SequenceMatcher(None, str1, str2).ratio()

def word_count(text: str) -> int:
    """Count words in text."""
    return len(re.findall(r'\w+', text))

def length_ratio(generated: str, expected: str) -> float:
    """Calculate ratio of generated text length to expected text length."""
    if not expected:
        return 0.0

    gen_length = len(generated)
    exp_length = len(expected)

    # Avoid division by zero
    if exp_length == 0:
        return 0.0 if gen_length > 0 else 1.0

    # Return value between 0 and 1, with 1 being perfect match
    # and decreasing as the ratio diverges from 1
    ratio = gen_length / exp_length
    return min(ratio, 1/ratio) if ratio > 0 else 0.0

def word_overlap(generated: str, expected: str) -> float:
    """Calculate the word overlap between generated and expected text."""
    if not expected or not generated:
        return 0.0

    gen_words = set(re.findall(r'\w+', generated.lower()))
    exp_words = set(re.findall(r'\w+', expected.lower()))

    if not exp_words:
        return 0.0

    intersection = gen_words.intersection(exp_words)
    return len(intersection) / len(exp_words)

def keyword_presence(text: str, keywords: List[str], weight: Optional[Dict[str, float]] = None) -> Dict[str, float]:
    """Check for presence of keywords with optional weights."""
    if not keywords:
        return {"keyword_score": 0.0}

    text = text.lower()
    result = {}

    total_weight = 0
    weighted_score = 0

    for keyword in keywords:
        keyword_lower = keyword.lower()
        presence = 1.0 if keyword_lower in text else 0.0

        # Apply weight if provided
        kw_weight = weight.get(keyword, 1.0) if weight else 1.0
        total_weight += kw_weight
        weighted_score += presence * kw_weight

        result[f"keyword_{keyword}"] = presence

    # Calculate overall weighted score
    if total_weight > 0:
        result["keyword_score"] = weighted_score / total_weight
    else:
        result["keyword_score"] = 0.0

    return result

class MetricsSet:
    """A collection of evaluation metrics functions."""
    def __init__(self):
        self.metrics = {}

    def add_metric(self, name: str, func: Callable, description: Optional[str] = None) -> None:
        """Add a metric function to the set."""
        self.metrics[name] = {
            "function": func,
            "description": description or ""
        }

    def evaluate(self, generated: str, expected: Optional[str] = None, **kwargs) -> Dict[str, float]:
        """Evaluate all metrics on the given text."""
        results = {}

        for name, metric in self.metrics.items():
            try:
                # Different metrics may require different arguments
                if expected is not None:
                    if "keywords" in kwargs and name == "keyword_presence":
                        result = metric["function"](generated, kwargs["keywords"])
                    else:
                        result = metric["function"](generated, expected)
                else:
                    result = metric["function"](generated)

                # Handle both single values and dictionaries
                if isinstance(result, dict):
                    results.update(result)
                else:
                    results[name] = result
            except Exception as e:
                results[name] = 0.0
                print(f"Error calculating metric {name}: {e}")

        return results

def create_default_metrics_set() -> MetricsSet:
    """Create a MetricsSet with default metrics."""
    metrics = MetricsSet()

    metrics.add_metric(
        "exact_match",
        exact_match,
        "Exact string match between expected and generated"
    )

    metrics.add_metric(
        "similarity",
        similarity_score,
        "String similarity using difflib's SequenceMatcher"
    )

    metrics.add_metric(
        "word_overlap",
        word_overlap,
        "Ratio of words in expected that appear in generated"
    )

    metrics.add_metric(
        "length_ratio",
        length_ratio,
        "Ratio of generated text length to expected text length"
    )

    return metrics
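A small sketch of the default metric set in use; the generated and reference sentences below are arbitrary examples.

    # Hypothetical usage sketch for create_default_metrics_set().
    from promptlab.utils.metrics import create_default_metrics_set

    metrics = create_default_metrics_set()
    scores = metrics.evaluate(
        generated="Paris is the capital of France.",
        expected="The capital of France is Paris.",
    )
    print(scores)  # exact_match, similarity, word_overlap, length_ratio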
promptlab/utils/storage.py
ADDED
@@ -0,0 +1,79 @@
import os
import json
import shutil
from typing import Dict, Any, Optional, List

class Storage:
    """Handles persistent storage for PromptLab."""
    def __init__(self, base_path: str):
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def ensure_dir(self, dir_path: str) -> str:
        """Ensure directory exists and return its path."""
        full_path = os.path.join(self.base_path, dir_path)
        os.makedirs(full_path, exist_ok=True)
        return full_path

    def save_json(self, dir_path: str, filename: str, data: Dict[str, Any]) -> str:
        """Save data to a JSON file."""
        dir_full_path = self.ensure_dir(dir_path)
        file_path = os.path.join(dir_full_path, f"{filename}.json")

        with open(file_path, "w") as f:
            json.dump(data, f, indent=2)

        return file_path

    def load_json(self, dir_path: str, filename: str) -> Optional[Dict[str, Any]]:
        """Load data from a JSON file."""
        file_path = os.path.join(self.base_path, dir_path, f"{filename}.json")

        if not os.path.exists(file_path):
            return None

        with open(file_path, "r") as f:
            return json.load(f)

    def list_files(self, dir_path: str, extension: Optional[str] = None) -> List[str]:
        """List files in a directory, optionally filtered by extension."""
        full_path = os.path.join(self.base_path, dir_path)

        if not os.path.exists(full_path):
            return []

        files = os.listdir(full_path)

        if extension:
            return [f for f in files if f.endswith(extension)]

        return files

    def delete_file(self, dir_path: str, filename: str) -> bool:
        """Delete a file."""
        file_path = os.path.join(self.base_path, dir_path, filename)

        if os.path.exists(file_path):
            os.remove(file_path)
            return True

        return False

    def backup(self, backup_path: Optional[str] = None) -> str:
        """Create a backup of the entire storage."""
        if not backup_path:
            backup_path = f"{self.base_path}_backup"

        shutil.make_archive(backup_path, "zip", self.base_path)
        return f"{backup_path}.zip"

    def restore(self, backup_path: str) -> bool:
        """Restore from a backup archive."""
        if not os.path.exists(backup_path):
            return False

        shutil.rmtree(self.base_path, ignore_errors=True)
        os.makedirs(self.base_path, exist_ok=True)

        shutil.unpack_archive(backup_path, self.base_path)
        return True
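A brief sketch of the storage round trip above; the base path, directory, and record contents are placeholders.

    # Hypothetical usage sketch: save, load, then archive and restore the store.
    from promptlab.utils.storage import Storage

    store = Storage("./promptlab_storage")
    store.save_json("prompts", "example", {"name": "Example", "content": "Hi {name}"})
    print(store.load_json("prompts", "example"))

    archive = store.backup()   # writes <base_path>_backup.zip and returns its path
    store.restore(archive)     # wipes the base path and re-extracts the archive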
promptlab/utils/templating.py
ADDED
@@ -0,0 +1,259 @@
import re
import json
from typing import Dict, Any, List, Optional, Union, Callable
from string import Formatter


class TemplateError(Exception):
    """Exception raised for errors in template rendering."""
    pass


class PromptTemplate:
    """Advanced templating system for prompts."""

    def __init__(self, template: str):
        self.template = template
        self._validate_template()

    def _validate_template(self) -> None:
        """Validate template syntax."""
        try:
            # Check for basic placeholder syntax
            list(Formatter().parse(self.template))

            # Check for conditional syntax
            self._validate_conditionals()

            # Check for loop syntax
            self._validate_loops()
        except Exception as e:
            raise TemplateError(f"Invalid template syntax: {str(e)}")

    def _validate_conditionals(self) -> None:
        """Validate conditional blocks in the template."""
        # Simple validation to ensure if/endif blocks match
        if_count = len(re.findall(r'\{\s*if\s+.*?\s*\}', self.template))
        endif_count = len(re.findall(r'\{\s*endif\s*\}', self.template))

        if if_count != endif_count:
            raise TemplateError(f"Mismatched conditional blocks: {if_count} 'if' and {endif_count} 'endif'")

    def _validate_loops(self) -> None:
        """Validate loop blocks in the template."""
        # Simple validation to ensure for/endfor blocks match
        for_count = len(re.findall(r'\{\s*for\s+.*?\s*\}', self.template))
        endfor_count = len(re.findall(r'\{\s*endfor\s*\}', self.template))

        if for_count != endfor_count:
            raise TemplateError(f"Mismatched loop blocks: {for_count} 'for' and {endfor_count} 'endfor'")

    def _render_conditionals(self, template: str, variables: Dict[str, Any]) -> str:
        """Process conditional blocks in the template."""
        # Handle if-else-endif blocks
        pattern = r'\{\s*if\s+(.*?)\s*\}(.*?)(?:\{\s*else\s*\}(.*?))?\{\s*endif\s*\}'

        def replace_conditional(match):
            condition = match.group(1)
            if_block = match.group(2)
            else_block = match.group(3) or ""

            # Evaluate condition
            try:
                # Replace variables in condition
                for var_name, var_value in variables.items():
                    if isinstance(var_value, str):
                        # For strings, replace with quoted value
                        condition = condition.replace(var_name, f'"{var_value}"')
                    else:
                        # For other types, replace directly
                        condition = condition.replace(var_name, str(var_value))

                result = eval(condition, {"__builtins__": {}}, variables)
                return if_block if result else else_block
            except Exception as e:
                raise TemplateError(f"Error evaluating condition '{condition}': {str(e)}")

        # Use re.DOTALL to match across multiple lines
        return re.sub(pattern, replace_conditional, template, flags=re.DOTALL)

    def _render_loops(self, template: str, variables: Dict[str, Any]) -> str:
        """Process loop blocks in the template."""
        # Handle for loops
        pattern = r'\{\s*for\s+(.*?)\s+in\s+(.*?)\s*\}(.*?)\{\s*endfor\s*\}'

        def replace_loop(match):
            var_name = match.group(1)
            iterable_expr = match.group(2)
            loop_body = match.group(3)

            try:
                # Get the iterable from variables
                if iterable_expr in variables and hasattr(variables[iterable_expr], '__iter__'):
                    iterable = variables[iterable_expr]
                else:
                    # Try to evaluate the expression
                    iterable = eval(iterable_expr, {"__builtins__": {}}, variables)

                if not hasattr(iterable, '__iter__'):
                    raise TemplateError(f"'{iterable_expr}' is not iterable")

                # Process the loop body for each item
                result = []
                for item in iterable:
                    # Create a copy of variables with loop variable
                    loop_vars = variables.copy()
                    loop_vars[var_name] = item

                    # Process the loop body with the new variables
                    body_content = loop_body
                    for k, v in loop_vars.items():
                        placeholder = f"{{{k}}}"
                        if placeholder in body_content:
                            body_content = body_content.replace(placeholder, str(v))

                    result.append(body_content)

                return "".join(result)
            except Exception as e:
                raise TemplateError(f"Error processing loop '{match.group(0)}': {str(e)}")

        # Use re.DOTALL to match across multiple lines
        return re.sub(pattern, replace_loop, template, flags=re.DOTALL)

    def _apply_filters(self, value: Any, filters: List[str]) -> str:
        """Apply filters to a value."""
        result = value
        for filter_name in filters:
            if filter_name == "upper":
                result = str(result).upper()
            elif filter_name == "lower":
                result = str(result).lower()
            elif filter_name == "title":
                result = str(result).title()
            elif filter_name == "capitalize":
                result = str(result).capitalize()
            elif filter_name == "strip":
                result = str(result).strip()
            elif filter_name == "json":
                result = json.dumps(result)
            else:
                raise TemplateError(f"Unknown filter: {filter_name}")
        return result

    def _render_variables(self, template: str, variables: Dict[str, Any]) -> str:
        """Replace variables in the template with their values."""
        result = template

        # Process variables with filters
        pattern = r'\{(.*?)(?:\|(.*?))?\}'

        def replace_var(match):
            var_expr = match.group(1).strip()
            filters_expr = match.group(2)

            # Extract filters
            filters = []
            if filters_expr:
                filters = [f.strip() for f in filters_expr.split('|')]

            try:
                # Simple variable
                if var_expr in variables:
                    value = variables[var_expr]
                else:
                    # Try to evaluate as an expression
                    try:
                        value = eval(var_expr, {"__builtins__": {}}, variables)
                    except Exception:
                        return match.group(0)  # Keep as is if evaluation fails

                # Apply filters
                return str(self._apply_filters(value, filters))
            except Exception as e:
                raise TemplateError(f"Error processing variable '{var_expr}': {str(e)}")

        return re.sub(pattern, replace_var, result)

    def render(self, **kwargs) -> str:
        """Render the template with provided variables."""
        result = self.template

        # Process templates in multiple passes
        # First, handle conditional blocks
        result = self._render_conditionals(result, kwargs)

        # Then, handle loops
        result = self._render_loops(result, kwargs)

        # Finally, handle simple variable substitution
        result = self._render_variables(result, kwargs)

        return result


class PromptTemplateRegistry:
    """Registry for prompt templates."""

    def __init__(self):
        self.templates: Dict[str, PromptTemplate] = {}

    def register(self, name: str, template: Union[str, PromptTemplate]) -> None:
        """Register a template."""
        if isinstance(template, str):
            template = PromptTemplate(template)
        self.templates[name] = template

    def get(self, name: str) -> Optional[PromptTemplate]:
        """Get a template by name."""
        return self.templates.get(name)

    def render(self, name: str, **kwargs) -> str:
        """Render a template by name."""
        template = self.get(name)
        if not template:
            raise ValueError(f"Template '{name}' not found")
        return template.render(**kwargs)

    def list_templates(self) -> List[str]:
        """List all registered templates."""
        return list(self.templates.keys())


# Create a singleton instance
template_registry = PromptTemplateRegistry()

# Register some common templates
template_registry.register(
    "basic_completion",
    """
{system_message}

{user_message}
"""
)

template_registry.register(
    "chat_template",
    """
{system_message}

{for message in conversation}
{if message.role == "user"}Human: {message.content}
{else}Assistant: {message.content}
{endif}
{endfor}
"""
)

template_registry.register(
    "few_shot",
    """
{system_message}

Here are some examples:
{for example in examples}
Input: {example.input}
Output: {example.output}
{endfor}

Input: {input}
Output:
"""
)
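A quick, hedged illustration of the templating API defined above — a minimal sketch, not part of the committed file; it assumes the package is installed so the module is importable as promptlab.utils.templating, and the names and values in it are made up for illustration:

# Minimal usage sketch of promptlab/utils/templating.py (illustrative only).
from promptlab.utils.templating import PromptTemplate, template_registry

# Placeholder substitution combined with the built-in "title" filter.
greeting = PromptTemplate("Hello, {name|title}!")
print(greeting.render(name="ada lovelace"))  # Hello, Ada Lovelace!

# A conditional block keyed on a boolean variable.
tone = PromptTemplate(
    "{if verbose}Explain your reasoning step by step.{else}Answer briefly.{endif}"
)
print(tone.render(verbose=True))  # Explain your reasoning step by step.

# Rendering one of the pre-registered templates by name.
prompt = template_registry.render(
    "basic_completion",
    system_message="You are a helpful assistant.",
    user_message="Summarise the following report.",
)
print(prompt)

The registry lookup in the last step works because basic_completion is one of the templates registered at module import time by the code above.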
pyproject.toml
ADDED
@@ -0,0 +1,45 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "promptlab"
version = "0.1.0"
description = "A comprehensive LLM Prompt Management System"
readme = "README.md"
requires-python = ">=3.7"
license = {text = "MIT"}
keywords = ["llm", "prompt engineering", "nlp", "machine learning"]
authors = [
    {name = "Biswanath Roul"}
]
maintainers = [
    {name = "Biswanath Roul"}
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "numpy>=1.20.0",
]

[project.urls]
"Homepage" = "https://github.com/biswanathroul/promptlab"
"Bug Tracker" = "https://github.com/biswanathroul/promptlab/issues"
"Documentation" = "https://github.com/biswanathroul/promptlab/wiki"
"Source Code" = "https://github.com/biswanathroul/promptlab"

[project.scripts]
promptlab = "promptlab.cli.commands:main"

[tool.setuptools]
packages = ["promptlab", "promptlab.core", "promptlab.cli", "promptlab.utils"]
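A usage note on this packaging config, offered as an assumption rather than something the commit states: the [project.scripts] entry maps a promptlab console command to promptlab.cli.commands:main, so installing the project (for example with an editable pip install -e . from the repository root) should place a promptlab executable on the PATH, and the [tool.setuptools] packages list is what makes promptlab.utils.templating importable as in the sketch above.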