biswanath2.roul committed
Commit e54fd17 · 0 Parent(s)

Initial commit

Files changed:
- .DS_Store +0 -0
- .gitignore +60 -0
- LICENSE +21 -0
- README.md +167 -0
- docs/README.md +11 -0
- docs/advanced_features.md +268 -0
- docs/api_reference.md +247 -0
- docs/cli_usage.md +118 -0
- docs/getting_started.md +110 -0
- docs/integration_examples.md +584 -0
- promptlab/__init__.py +39 -0
- promptlab/cli/__init__.py +0 -0
- promptlab/cli/commands.py +697 -0
- promptlab/core/__init__.py +0 -0
- promptlab/core/evaluation.py +191 -0
- promptlab/core/prompt_manager.py +169 -0
- promptlab/core/testing.py +451 -0
- promptlab/core/version_control.py +161 -0
- promptlab/examples/__init__.py +0 -0
- promptlab/examples/ab_testing.py +117 -0
- promptlab/examples/basic_usage.py +109 -0
- promptlab/examples/evaluation_example.py +95 -0
- promptlab/tests/__init__.py +0 -0
- promptlab/tests/test_evaluation.py +0 -0
- promptlab/tests/test_prompt_manager.py +115 -0
- promptlab/tests/test_testing.py +0 -0
- promptlab/tests/test_version_control.py +0 -0
- promptlab/utils/__init__.py +0 -0
- promptlab/utils/metrics.py +161 -0
- promptlab/utils/storage.py +79 -0
- promptlab/utils/templating.py +259 -0
- pyproject.toml +45 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.gitignore
ADDED
@@ -0,0 +1,60 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
env/
venv/
ENV/
env.bak/
venv.bak/
pl200525/

# Jupyter Notebook
.ipynb_checkpoints

# Prompt storage (for local development)
promptlab_storage/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Biswanath Roul

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,167 @@
# PromptLab: LLM Prompt Management System

PromptLab is a comprehensive library for managing, versioning, testing, and evaluating prompts for Large Language Models (LLMs). It provides a structured framework to help data scientists and developers create, optimize, and maintain high-quality prompts.

## Features

- **Prompt Management**: Create, update, and organize prompts with metadata and tags
- **Version Control**: Track prompt changes over time with full version history
- **A/B Testing**: Compare different prompt variations to find the most effective one
- **Evaluation Framework**: Measure prompt quality with customizable metrics
- **Advanced Templating**: Create dynamic prompts with variables, conditionals, and loops
- **Command-line Interface**: Easily integrate into your workflow

## Documentation

For detailed documentation, see the [docs](./docs) directory:

- [Getting Started](./docs/getting_started.md)
- [API Reference](./docs/api_reference.md)
- [CLI Usage](./docs/cli_usage.md)
- [Advanced Features](./docs/advanced_features.md)
- [Integration Examples](./docs/integration_examples.md)

## Installation

```bash
pip install promptlab
```

## Quick Start

```python
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator

# Initialize components
prompt_manager = PromptManager()
version_control = VersionControl(prompt_manager)
testing = PromptTesting(prompt_manager)
evaluator = Evaluator(prompt_manager)

# Create a prompt
prompt = prompt_manager.create(
    content="Summarize the following text: {text}",
    name="Simple Summarization",
    description="A simple prompt for text summarization",
    tags=["summarization", "basic"]
)

# Create a new version
version_control.commit(
    prompt_id=prompt.id,
    commit_message="Initial version"
)

# Update the prompt
prompt_manager.update(
    prompt.id,
    content="Please provide a concise summary of the following text in 2-3 sentences: {text}"
)

# Commit the updated version
version_control.commit(
    prompt_id=prompt.id,
    commit_message="Improved prompt with length guidance"
)

# Create a test case
test_case = testing.create_test_case(
    prompt_id=prompt.id,
    input_vars={"text": "Lorem ipsum dolor sit amet..."},
    expected_output="This is a summary of the text."
)

# Define an LLM callback for testing
async def llm_callback(prompt, vars):
    # In a real scenario, this would call an actual LLM API
    return "This is a summary of the text."

# Run the test case
import asyncio
test_result = asyncio.run(testing.run_test_case(
    test_case_id=test_case.id,
    llm_callback=llm_callback
))

# Evaluate a prompt with multiple inputs
evaluation_result = asyncio.run(evaluator.evaluate_prompt(
    prompt_id=prompt.id,
    inputs=[{"text": "Sample text 1"}, {"text": "Sample text 2"}],
    llm_callback=llm_callback
))

print(f"Evaluation metrics: {evaluation_result['aggregated_metrics']}")
```

## Command-line Interface

PromptLab comes with a powerful CLI for managing prompts:

```bash
# Create a prompt
promptlab prompt create "Summarization" --content "Summarize: {text}" --tags "summarization,basic"

# List all prompts
promptlab prompt list

# Create a new version
promptlab version commit <prompt_id> --message "Updated prompt"

# Run tests
promptlab test run-all <prompt_id> --llm openai
```

## Advanced Usage

### Advanced Templating

PromptLab supports advanced templating with conditionals and loops:

```python
from promptlab import PromptTemplate

template = PromptTemplate("""
{system_message}

{for example in examples}
Input: {example.input}
Output: {example.output}
{endfor}

Input: {input}
Output:
""")

rendered = template.render(
    system_message="You are a helpful assistant.",
    examples=[
        {"input": "Hello", "output": "Hi there!"},
        {"input": "How are you?", "output": "I'm doing well, thanks!"}
    ],
    input="What's the weather like?"
)
```

### Custom Evaluation Metrics

Create custom metrics to evaluate prompt performance:

```python
from promptlab import EvaluationMetric, Evaluator

class CustomMetric(EvaluationMetric):
    def __init__(self):
        super().__init__("custom_metric", "My custom evaluation metric")

    def compute(self, generated_output, expected_output=None, **kwargs):
        # Custom logic to score the output
        return score  # A float between 0 and 1

# Register the custom metric
evaluator = Evaluator(prompt_manager)
evaluator.register_metric(CustomMetric())
```

## Use Cases

- **Prompt Development**: Iteratively develop and refine prompts with version control
- **Prompt Optimization**: A/B test different prompt variations to find the most effective approach
- **Quality Assurance**: Ensure prompt quality with automated testing and evaluation
- **Team Collaboration**: Share and collaborate on prompts with a centralized management system
- **Production Deployment**: Maintain consistent prompt quality in production applications

## License

MIT License

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Author

Biswanath Roul - [GitHub](https://github.com/biswanathroul)
docs/README.md
ADDED
@@ -0,0 +1,11 @@
# PromptLab Documentation

This directory contains detailed documentation for the PromptLab library.

## Contents

- [Getting Started](./getting_started.md)
- [API Reference](./api_reference.md)
- [CLI Usage](./cli_usage.md)
- [Advanced Features](./advanced_features.md)
- [Integration Examples](./integration_examples.md)
docs/advanced_features.md
ADDED
@@ -0,0 +1,268 @@
# Advanced Features

PromptLab provides several advanced features for sophisticated prompt engineering.

## Advanced Templating

PromptLab's templating system goes beyond simple variable substitution, offering conditionals and loops.

### Basic Variable Substitution

```python
from promptlab import PromptTemplate

# Simple variable substitution
template = PromptTemplate("Hello, {name}!")
rendered = template.render(name="John")
# Result: "Hello, John!"
```

### Conditional Logic

```python
# Conditionals
template = PromptTemplate("""
{if is_formal}
Dear {name},

I hope this message finds you well.
{else}
Hey {name}!
{endif}

{message}
""")

formal = template.render(is_formal=True, name="Dr. Smith", message="Please review the attached document.")
casual = template.render(is_formal=False, name="Bob", message="Want to grab lunch?")
```

### Loops

```python
# Loops
template = PromptTemplate("""
Here are your tasks:

{for task in tasks}
- {task.priority}: {task.description}
{endfor}
""")

rendered = template.render(tasks=[
    {"priority": "High", "description": "Complete the report"},
    {"priority": "Medium", "description": "Schedule meeting"},
    {"priority": "Low", "description": "Organize files"}
])
```

### Nested Structures

```python
# Combining loops and conditionals
template = PromptTemplate("""
{system_message}

{for example in examples}
User: {example.input}
{if example.has_reasoning}
Reasoning: {example.reasoning}
{endif}
Assistant: {example.output}
{endfor}

User: {query}
Assistant:
""")

rendered = template.render(
    system_message="You are a helpful assistant.",
    examples=[
        {
            "input": "What's 2+2?",
            "has_reasoning": True,
            "reasoning": "Adding 2 and 2 gives 4",
            "output": "4"
        },
        {
            "input": "Hello",
            "has_reasoning": False,
            "output": "Hi there! How can I help you today?"
        }
    ],
    query="What's the capital of France?"
)
```

## Custom Evaluation Metrics

You can create custom metrics to evaluate prompt outputs based on your specific requirements.

### Creating a Custom Metric

```python
from promptlab import EvaluationMetric

class RelevanceMetric(EvaluationMetric):
    """Evaluates relevance of output to a given topic."""

    def __init__(self, topics):
        super().__init__("relevance", "Evaluates relevance to specified topics")
        self.topics = topics

    def compute(self, generated_output, expected_output=None, **kwargs):
        """
        Compute relevance score based on topic presence.
        Returns a float between 0 and 1.
        """
        score = 0
        output_lower = generated_output.lower()

        for topic in self.topics:
            if topic.lower() in output_lower:
                score += 1

        # Normalize to 0-1 range
        return min(1.0, score / len(self.topics)) if self.topics else 0.0
```

### Using Custom Metrics

```python
from promptlab import Evaluator, PromptManager

# Initialize components
prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Register custom metric
climate_relevance = RelevanceMetric(["climate", "temperature", "warming", "environment"])
evaluator.register_metric(climate_relevance)

# Use in evaluation
async def my_llm(prompt, vars):
    # Call your LLM API here
    return "Climate change is causing global temperature increases..."

results = await evaluator.evaluate_prompt(
    prompt_id="abc123",
    inputs=[{"topic": "climate change"}],
    llm_callback=my_llm,
    metric_names=["relevance"]  # Use our custom metric
)

print(f"Relevance score: {results['aggregated_metrics']['relevance']}")
```

## Customizing Storage

PromptLab allows you to customize where and how prompts and related data are stored.

### Custom Storage Locations

```python
# Specify a custom storage location
prompt_manager = PromptManager("/path/to/my/prompts")

# Export/import prompts
import json

# Export a prompt to a file
prompt = prompt_manager.get("abc123")
with open("exported_prompt.json", "w") as f:
    json.dump(prompt.to_dict(), f, indent=2)

# Import a prompt from a file
with open("exported_prompt.json", "r") as f:
    data = json.load(f)
imported_prompt = prompt_manager.import_prompt(data)
```

## LLM Integration

PromptLab is designed to work with any LLM through callback functions. Here are examples of integrating with popular LLM APIs.

### OpenAI Integration

```python
import openai
from promptlab import PromptManager, PromptTesting

prompt_manager = PromptManager()
testing = PromptTesting(prompt_manager)

# Configure OpenAI
openai.api_key = "your-api-key"

# OpenAI callback function
async def openai_callback(prompt, vars):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=150
    )
    return response.choices[0].message.content

# Run tests with OpenAI
test_results = await testing.run_all_tests("abc123", openai_callback)
```

### Anthropic Integration

```python
import anthropic
from promptlab import PromptManager, Evaluator

prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Configure Anthropic
client = anthropic.Anthropic(api_key="your-api-key")

# Anthropic callback function
async def anthropic_callback(prompt, vars):
    response = client.messages.create(
        model="claude-2",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150
    )
    return response.content[0].text

# Evaluate with Anthropic
eval_results = await evaluator.evaluate_prompt(
    prompt_id="abc123",
    inputs=[{"query": "What is machine learning?"}],
    llm_callback=anthropic_callback
)
```

### Hugging Face Integration

```python
from transformers import pipeline
import asyncio
from promptlab import PromptManager, VersionControl

prompt_manager = PromptManager()
version_control = VersionControl(prompt_manager)

# Set up Hugging Face pipeline
generator = pipeline('text-generation', model='gpt2')

# Hugging Face callback function
async def hf_callback(prompt, vars):
    # Run synchronously but in a way that doesn't block the asyncio event loop
    loop = asyncio.get_event_loop()
    result = await loop.run_in_executor(None, lambda: generator(prompt, max_length=100)[0]['generated_text'])
    return result

# Use with version control
prompt = prompt_manager.create(
    content="Complete this: {text}",
    name="Text Completion"
)
version_control.commit(prompt.id, "Initial version")

# Test with different models by swapping the callback
```
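The closing comment above suggests comparing models by swapping the callback while keeping the prompt and test cases fixed. Below is a minimal sketch of that pattern, using only `PromptTesting.run_all_tests` as documented in the API reference; the two callbacks are stand-ins rather than real model integrations.

```python
import asyncio
from promptlab import PromptManager, PromptTesting

prompt_manager = PromptManager()
testing = PromptTesting(prompt_manager)

# The same prompt and test case are reused for every model under comparison.
prompt = prompt_manager.create(content="Complete this: {text}", name="Text Completion")
testing.create_test_case(prompt_id=prompt.id, input_vars={"text": "Once upon a time"})

# Placeholder callbacks standing in for two different model backends.
async def model_a_callback(prompt_text, vars):
    return "Once upon a time there was a prompt."

async def model_b_callback(prompt_text, vars):
    return "Once upon a time there was a library."

async def compare_models():
    # Run the identical test suite against each callback and collect results side by side.
    results = {}
    for name, callback in [("model_a", model_a_callback), ("model_b", model_b_callback)]:
        results[name] = await testing.run_all_tests(prompt.id, callback)
    return results

results_by_model = asyncio.run(compare_models())
```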
docs/api_reference.md
ADDED
@@ -0,0 +1,247 @@
# API Reference

This document provides detailed API documentation for the main components of PromptLab.

## PromptManager

The `PromptManager` class is the core component for managing prompts.

```python
from promptlab import PromptManager
```

### Methods

#### `__init__(storage_path=None)`
- **Description**: Initialize a new PromptManager.
- **Parameters**:
  - `storage_path` (str, optional): Path to store prompts. Defaults to "~/promptlab_storage".

#### `create(content, name, description='', tags=None, metadata=None)`
- **Description**: Create a new prompt.
- **Parameters**:
  - `content` (str): The prompt text with optional variables in {variable_name} format.
  - `name` (str): Name of the prompt.
  - `description` (str, optional): Description of the prompt.
  - `tags` (list of str, optional): Tags for categorization.
  - `metadata` (dict, optional): Additional metadata.
- **Returns**: `Prompt` object.

#### `get(prompt_id)`
- **Description**: Get a prompt by ID.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
- **Returns**: `Prompt` object or None if not found.

#### `update(prompt_id, content=None, name=None, description=None, tags=None, metadata=None)`
- **Description**: Update a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to update.
  - `content` (str, optional): New prompt text.
  - `name` (str, optional): New name.
  - `description` (str, optional): New description.
  - `tags` (list of str, optional): New tags.
  - `metadata` (dict, optional): New metadata.
- **Returns**: Updated `Prompt` object.

#### `delete(prompt_id)`
- **Description**: Delete a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to delete.
- **Returns**: True if deleted, False otherwise.

#### `list_all()`
- **Description**: List all prompts.
- **Returns**: List of `Prompt` objects.

#### `search_by_tags(tags, match_all=False)`
- **Description**: Search prompts by tags.
- **Parameters**:
  - `tags` (list of str): Tags to search for.
  - `match_all` (bool, optional): If True, prompt must have all tags.
- **Returns**: List of matching `Prompt` objects.

## VersionControl

The `VersionControl` class manages prompt versions.

```python
from promptlab import VersionControl
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the version control system.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `commit(prompt_id, commit_message, metadata=None)`
- **Description**: Create a new version of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `commit_message` (str): Message describing the changes.
  - `metadata` (dict, optional): Additional version metadata.
- **Returns**: Version number (int).

#### `list_versions(prompt_id)`
- **Description**: List all versions of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
- **Returns**: List of version objects.

#### `get_version(prompt_id, version_number)`
- **Description**: Get a specific version of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version_number` (int): The version number.
- **Returns**: Version data.

#### `checkout(prompt_id, version_number)`
- **Description**: Revert a prompt to a specific version.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version_number` (int): The version to revert to.
- **Returns**: Updated `Prompt` object.

#### `diff(prompt_id, version1, version2)`
- **Description**: Compare two versions of a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `version1` (int): First version number.
  - `version2` (int): Second version number.
- **Returns**: Diff object.

## PromptTesting

The `PromptTesting` class provides testing capabilities.

```python
from promptlab import PromptTesting
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the testing system.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `create_test_case(prompt_id, input_vars, expected_output=None, name=None, description=None)`
- **Description**: Create a test case for a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt to test.
  - `input_vars` (dict): Variables to substitute in the prompt.
  - `expected_output` (str, optional): Expected response.
  - `name` (str, optional): Test case name.
  - `description` (str, optional): Test case description.
- **Returns**: Test case object.

#### `run_test_case(test_case_id, llm_callback)`
- **Description**: Run a test case.
- **Parameters**:
  - `test_case_id` (str): The ID of the test case.
  - `llm_callback` (callable): Function to call LLM.
- **Returns**: Test result.

#### `run_all_tests(prompt_id, llm_callback)`
- **Description**: Run all tests for a prompt.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `llm_callback` (callable): Function to call LLM.
- **Returns**: List of test results.

#### `ab_test(prompt_id_a, prompt_id_b, test_cases, llm_callback, metrics=None)`
- **Description**: Run A/B tests comparing two prompts.
- **Parameters**:
  - `prompt_id_a` (str): First prompt ID.
  - `prompt_id_b` (str): Second prompt ID.
  - `test_cases` (list): Test cases to run.
  - `llm_callback` (callable): Function to call LLM.
  - `metrics` (list, optional): Metrics to compare.
- **Returns**: A/B test results.

## Evaluator

The `Evaluator` class handles prompt evaluation.

```python
from promptlab import Evaluator
```

### Methods

#### `__init__(prompt_manager)`
- **Description**: Initialize the evaluator.
- **Parameters**:
  - `prompt_manager` (PromptManager): A PromptManager instance.

#### `register_metric(metric)`
- **Description**: Register a new evaluation metric.
- **Parameters**:
  - `metric` (EvaluationMetric): The metric to register.

#### `evaluate_prompt(prompt_id, inputs, llm_callback, expected_outputs=None, metric_names=None)`
- **Description**: Evaluate a prompt with the given inputs and metrics.
- **Parameters**:
  - `prompt_id` (str): The ID of the prompt.
  - `inputs` (list): List of input dictionaries.
  - `llm_callback` (callable): Function to call LLM.
  - `expected_outputs` (list, optional): Expected outputs.
  - `metric_names` (list, optional): Metrics to use.
- **Returns**: Evaluation results.

## PromptTemplate

The `PromptTemplate` class provides advanced templating.

```python
from promptlab import PromptTemplate
```

### Methods

#### `__init__(template_string)`
- **Description**: Initialize a template.
- **Parameters**:
  - `template_string` (str): Template with variables, conditionals, and loops.

#### `render(**variables)`
- **Description**: Render the template with given variables.
- **Parameters**:
  - `variables` (dict): Variables to substitute.
- **Returns**: Rendered string.

## EvaluationMetric

The `EvaluationMetric` is the base class for evaluation metrics.

```python
from promptlab import EvaluationMetric
```

### Methods

#### `__init__(name, description=None)`
- **Description**: Initialize a metric.
- **Parameters**:
  - `name` (str): Metric name.
  - `description` (str, optional): Metric description.

#### `compute(generated_output, expected_output=None, **kwargs)`
- **Description**: Compute the metric score.
- **Parameters**:
  - `generated_output` (str): Output from LLM.
  - `expected_output` (str, optional): Expected output.
  - `**kwargs`: Additional parameters.
- **Returns**: Score (float between 0 and 1).

### Built-in Metrics

- `ExactMatchMetric`: Scores exact matches between generated and expected output.
- `ContainsKeywordsMetric`: Scores based on keyword presence.
- `LengthMetric`: Scores based on output length.

```python
from promptlab import ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
```
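The built-in metrics above are listed without a worked example elsewhere in these docs. The sketch below wires `ExactMatchMetric` into `Evaluator.evaluate_prompt` using only the signatures documented on this page; the no-argument `ExactMatchMetric()` constructor and the `"exact_match"` metric name are assumptions, and the callback is a placeholder rather than a real LLM call.

```python
import asyncio
from promptlab import PromptManager, Evaluator, ExactMatchMetric

prompt_manager = PromptManager()
evaluator = Evaluator(prompt_manager)

# Register a built-in metric; a no-argument constructor is assumed here.
evaluator.register_metric(ExactMatchMetric())

prompt = prompt_manager.create(
    content="Answer with a single word: what is the capital of France? {hint}",
    name="Capital Quiz",
)

# Placeholder callback standing in for a real LLM API call.
async def llm_callback(prompt_text, vars):
    return "Paris"

results = asyncio.run(evaluator.evaluate_prompt(
    prompt_id=prompt.id,
    inputs=[{"hint": ""}],
    expected_outputs=["Paris"],
    llm_callback=llm_callback,
    metric_names=["exact_match"],  # metric name assumed to mirror the class name
))
print(results["aggregated_metrics"])
```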
docs/cli_usage.md
ADDED
@@ -0,0 +1,118 @@
# CLI Usage

PromptLab provides a command-line interface (CLI) for managing prompts, versions, tests, and evaluations.

## Basic Commands

### Prompt Management

```bash
# Create a prompt
promptlab prompt create "Weather Forecast" --content "Provide a weather forecast for {location} on {date}" --tags "weather,forecast"

# List all prompts
promptlab prompt list

# Get prompt details
promptlab prompt get <prompt_id>

# Update a prompt
promptlab prompt update <prompt_id> --content "New content" --tags "new,tags"

# Delete a prompt
promptlab prompt delete <prompt_id>
```

### Version Control

```bash
# Commit a version
promptlab version commit <prompt_id> --message "Version description"

# List versions
promptlab version list <prompt_id>

# Check out (revert to) a specific version
promptlab version checkout <prompt_id> <version_number>

# Compare versions
promptlab version diff <prompt_id> <version1> <version2>
```

### Testing

```bash
# Create a test case
promptlab test create <prompt_id> --input '{"location": "New York", "date": "tomorrow"}' --expected "Expected output"

# List test cases
promptlab test list <prompt_id>

# Run a specific test case
promptlab test run <test_case_id> --llm openai

# Run all test cases for a prompt
promptlab test run-all <prompt_id> --llm openai

# Run an A/B test between two prompts
promptlab test ab <prompt_id_a> <prompt_id_b> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
```

### Evaluation

```bash
# Evaluate a prompt
promptlab eval run <prompt_id> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai

# List available metrics
promptlab eval metrics

# Register a custom metric
promptlab eval register-metric <metric_file.py>
```

## Environment Configuration

The CLI supports environment variables for configuration:

- `PROMPTLAB_STORAGE`: Path to store prompts and related data
- `PROMPTLAB_OPENAI_API_KEY`: OpenAI API key for built-in LLM support
- `PROMPTLAB_DEFAULT_LLM`: Default LLM to use for testing and evaluation

You can also create a config file at `~/.promptlab/config.json`:

```json
{
  "storage_path": "/path/to/storage",
  "default_llm": "openai",
  "api_keys": {
    "openai": "your-openai-key"
  }
}
```

## Advanced Usage

### Multiple Storage Locations

```bash
# Specify a storage location for a command
promptlab --storage /path/to/storage prompt list

# Export a prompt to another storage
promptlab prompt export <prompt_id> --output /path/to/output.json

# Import a prompt from a file
promptlab prompt import /path/to/prompt.json
```

### Automation and Scripting

```bash
# Get output in JSON format
promptlab --json prompt list

# Use in shell scripts
PROMPT_ID=$(promptlab --json prompt create "Script Prompt" --content "Content" | jq -r '.id')
echo "Created prompt with ID: $PROMPT_ID"
```
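As a convenience, here is one possible shell setup combining the environment variables documented above with the commands from the earlier sections; the storage path and API key value are placeholders.

```bash
# Point PromptLab at a project-local storage directory for this shell session.
export PROMPTLAB_STORAGE="$HOME/projects/my-app/promptlab_storage"

# Configure the built-in OpenAI support and make it the default backend.
export PROMPTLAB_OPENAI_API_KEY="sk-..."   # placeholder key
export PROMPTLAB_DEFAULT_LLM="openai"

# Commands in this shell now use the configured storage and default LLM.
promptlab prompt list
```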
docs/getting_started.md
ADDED
@@ -0,0 +1,110 @@
# Getting Started with PromptLab

This guide will help you get started with PromptLab, a comprehensive library for managing LLM prompts.

## Installation

```bash
pip install promptlab
```

## Basic Usage

### Initialize Components

```python
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator

# Initialize with default storage location
prompt_manager = PromptManager()

# Or specify a custom storage location
# prompt_manager = PromptManager("/path/to/storage")

# Initialize other components
version_control = VersionControl(prompt_manager)
testing = PromptTesting(prompt_manager)
evaluator = Evaluator(prompt_manager)
```

### Create and Manage Prompts

```python
# Create a prompt
prompt = prompt_manager.create(
    content="Translate the following text from {source_language} to {target_language}: {text}",
    name="Translation Prompt",
    description="A prompt for translating text between languages",
    tags=["translation", "multilingual"]
)

# The prompt.id property contains a unique identifier (e.g., "a1b2c3d4e5")
prompt_id = prompt.id

# Get a prompt by ID
retrieved_prompt = prompt_manager.get(prompt_id)

# Update a prompt
prompt_manager.update(
    prompt_id,
    content="Please translate the following text from {source_language} to {target_language}:\n\n{text}"
)

# Search prompts by tags
translation_prompts = prompt_manager.search_by_tags(["translation"])

# List all prompts
all_prompts = prompt_manager.list_all()
```

### Version Control

```python
# Create a version snapshot
version_control.commit(
    prompt_id=prompt_id,
    commit_message="Initial version"
)

# Update the prompt and create another version
prompt_manager.update(
    prompt_id,
    content="Please provide a translation of the following text from {source_language} to {target_language}:\n\n{text}\n\nMaintain the original formatting and tone."
)

version_control.commit(
    prompt_id=prompt_id,
    commit_message="Added formatting instructions"
)

# List all versions
versions = version_control.list_versions(prompt_id)

# Compare versions
diff = version_control.diff(prompt_id, 1, 2)

# Revert to a previous version
version_control.checkout(prompt_id, 1)
```

### Using Prompts with Variables

```python
# Get a prompt
prompt = prompt_manager.get(prompt_id)

# Render with variables
rendered_prompt = prompt.render(
    source_language="English",
    target_language="Spanish",
    text="Hello, how are you today?"
)

# Now use rendered_prompt with your LLM API
```

## Next Steps

- See the [CLI Usage](./cli_usage.md) guide for command-line operations
- Explore [Advanced Features](./advanced_features.md) for templating and custom metrics
- Check [Integration Examples](./integration_examples.md) for real-world use cases
docs/integration_examples.md
ADDED
@@ -0,0 +1,584 @@
1 |
+
# Integration Examples
|
2 |
+
|
3 |
+
This document provides concrete examples of integrating PromptLab into various applications and workflows.
|
4 |
+
|
5 |
+
## Customer Support Chatbot
|
6 |
+
|
7 |
+
### Setup
|
8 |
+
|
9 |
+
```python
|
10 |
+
from promptlab import PromptManager, VersionControl
|
11 |
+
import openai
|
12 |
+
|
13 |
+
# Initialize components
|
14 |
+
prompt_manager = PromptManager()
|
15 |
+
version_control = VersionControl(prompt_manager)
|
16 |
+
|
17 |
+
# Create prompt templates for different scenarios
|
18 |
+
greeting_prompt = prompt_manager.create(
|
19 |
+
content="You are a helpful customer service agent for {company_name}. Greet the customer politely.",
|
20 |
+
name="Customer Greeting",
|
21 |
+
tags=["customer-service", "greeting"]
|
22 |
+
)
|
23 |
+
|
24 |
+
inquiry_prompt = prompt_manager.create(
|
25 |
+
content="""
|
26 |
+
You are a helpful customer service agent for {company_name}.
|
27 |
+
Customer inquiry: {customer_message}
|
28 |
+
|
29 |
+
Based on this inquiry:
|
30 |
+
1. Identify the main issue
|
31 |
+
2. Provide a helpful response
|
32 |
+
3. Offer additional assistance
|
33 |
+
|
34 |
+
Keep your tone professional but friendly.
|
35 |
+
""",
|
36 |
+
name="Customer Inquiry Response",
|
37 |
+
tags=["customer-service", "inquiry"]
|
38 |
+
)
|
39 |
+
|
40 |
+
# Version them
|
41 |
+
version_control.commit(greeting_prompt.id, "Initial version")
|
42 |
+
version_control.commit(inquiry_prompt.id, "Initial version")
|
43 |
+
|
44 |
+
# OpenAI callback
|
45 |
+
def generate_response(prompt_text):
|
46 |
+
response = openai.ChatCompletion.create(
|
47 |
+
model="gpt-3.5-turbo",
|
48 |
+
messages=[{"role": "user", "content": prompt_text}]
|
49 |
+
)
|
50 |
+
return response.choices[0].message.content
|
51 |
+
|
52 |
+
# Main handler function
|
53 |
+
def handle_customer_message(customer_name, message, is_new_conversation):
|
54 |
+
if is_new_conversation:
|
55 |
+
# Use greeting prompt for new conversations
|
56 |
+
prompt = prompt_manager.get(greeting_prompt.id)
|
57 |
+
prompt_text = prompt.render(company_name="Acme Inc.")
|
58 |
+
return generate_response(prompt_text)
|
59 |
+
else:
|
60 |
+
# Use inquiry prompt for ongoing conversations
|
61 |
+
prompt = prompt_manager.get(inquiry_prompt.id)
|
62 |
+
prompt_text = prompt.render(
|
63 |
+
company_name="Acme Inc.",
|
64 |
+
customer_message=message
|
65 |
+
)
|
66 |
+
return generate_response(prompt_text)
|
67 |
+
```
|
68 |
+
|
69 |
+
## Content Generation System
|
70 |
+
|
71 |
+
### Setup
|
72 |
+
|
73 |
+
```python
|
74 |
+
from promptlab import PromptManager, PromptTesting, Evaluator
|
75 |
+
import asyncio
|
76 |
+
|
77 |
+
# Initialize components
|
78 |
+
prompt_manager = PromptManager("content_system_prompts")
|
79 |
+
testing = PromptTesting(prompt_manager)
|
80 |
+
evaluator = Evaluator(prompt_manager)
|
81 |
+
|
82 |
+
# Create content generation prompt
|
83 |
+
blog_prompt = prompt_manager.create(
|
84 |
+
content="""
|
85 |
+
Write a blog post about {topic}.
|
86 |
+
|
87 |
+
Title: {title}
|
88 |
+
|
89 |
+
The post should:
|
90 |
+
- Be approximately {word_count} words
|
91 |
+
- Be written in a {tone} tone
|
92 |
+
- Include {num_sections} main sections
|
93 |
+
- Target audience: {audience}
|
94 |
+
- Include a compelling call-to-action at the end
|
95 |
+
|
96 |
+
Keywords to include: {keywords}
|
97 |
+
""",
|
98 |
+
name="Blog Post Generator",
|
99 |
+
tags=["content", "blog"]
|
100 |
+
)
|
101 |
+
|
102 |
+
# Test cases
|
103 |
+
test_case = testing.create_test_case(
|
104 |
+
prompt_id=blog_prompt.id,
|
105 |
+
input_vars={
|
106 |
+
"topic": "Sustainable Living",
|
107 |
+
"title": "10 Simple Ways to Reduce Your Carbon Footprint",
|
108 |
+
"word_count": "800",
|
109 |
+
"tone": "informative yet casual",
|
110 |
+
"num_sections": "5",
|
111 |
+
"audience": "environmentally-conscious millennials",
|
112 |
+
"keywords": "sustainability, eco-friendly, carbon footprint, climate change, lifestyle changes"
|
113 |
+
}
|
114 |
+
)
|
115 |
+
|
116 |
+
# LLM callback
|
117 |
+
async def content_llm_callback(prompt, vars):
|
118 |
+
# Call your preferred LLM API here
|
119 |
+
# This is a placeholder
|
120 |
+
return f"Generated content about {vars.get('topic', 'unknown topic')}"
|
121 |
+
|
122 |
+
# Content generation function
|
123 |
+
async def generate_content(content_type, parameters):
|
124 |
+
if content_type == "blog":
|
125 |
+
prompt = prompt_manager.get(blog_prompt.id)
|
126 |
+
rendered_prompt = prompt.render(**parameters)
|
127 |
+
|
128 |
+
# Generate content
|
129 |
+
content = await content_llm_callback(rendered_prompt, parameters)
|
130 |
+
|
131 |
+
# Evaluate quality
|
132 |
+
evaluation = await evaluator.evaluate_prompt(
|
133 |
+
prompt_id=blog_prompt.id,
|
134 |
+
inputs=[parameters],
|
135 |
+
llm_callback=content_llm_callback
|
136 |
+
)
|
137 |
+
|
138 |
+
quality_score = evaluation["aggregated_metrics"].get("length", 0)
|
139 |
+
|
140 |
+
return {
|
141 |
+
"content": content,
|
142 |
+
"quality_score": quality_score,
|
143 |
+
"metadata": {
|
144 |
+
"prompt_id": blog_prompt.id,
|
145 |
+
"prompt_version": prompt.version,
|
146 |
+
"parameters": parameters
|
147 |
+
}
|
148 |
+
}
|
149 |
+
else:
|
150 |
+
raise ValueError(f"Unsupported content type: {content_type}")
|
151 |
+
```
|
152 |
+
|
153 |
+
## AI-Assisted Research Tool
|
154 |
+
|
155 |
+
### Setup
|
156 |
+
|
157 |
+
```python
|
158 |
+
from promptlab import PromptManager, VersionControl
|
159 |
+
import json
|
160 |
+
import openai
|
161 |
+
|
162 |
+
# Initialize components
|
163 |
+
prompt_manager = PromptManager("research_prompts")
|
164 |
+
version_control = VersionControl(prompt_manager)
|
165 |
+
|
166 |
+
# Create research prompts
|
167 |
+
article_summary_prompt = prompt_manager.create(
|
168 |
+
content="""
|
169 |
+
Summarize the following research article:
|
170 |
+
|
171 |
+
Title: {article_title}
|
172 |
+
Abstract: {article_abstract}
|
173 |
+
|
174 |
+
Provide a summary that:
|
175 |
+
1. Identifies the main research question
|
176 |
+
2. Outlines the methodology
|
177 |
+
3. Summarizes key findings
|
178 |
+
4. Highlights limitations
|
179 |
+
5. Explains the significance of the results
|
180 |
+
|
181 |
+
Keep the summary concise, approximately 250 words.
|
182 |
+
""",
|
183 |
+
name="Article Summarizer",
|
184 |
+
tags=["research", "summary"]
|
185 |
+
)
|
186 |
+
|
187 |
+
research_question_prompt = prompt_manager.create(
|
188 |
+
content="""
|
189 |
+
Based on the following information:
|
190 |
+
|
191 |
+
Research Area: {research_area}
|
192 |
+
Existing Knowledge: {existing_knowledge}
|
193 |
+
Observed Gap: {knowledge_gap}
|
194 |
+
|
195 |
+
Generate 5 potential research questions that:
|
196 |
+
1. Address the identified knowledge gap
|
197 |
+
2. Are specific and answerable
|
198 |
+
3. Have theoretical or practical significance
|
199 |
+
4. Can be investigated with available research methods
|
200 |
+
""",
|
201 |
+
name="Research Question Generator",
|
202 |
+
tags=["research", "question-generation"]
|
203 |
+
)
|
204 |
+
|
205 |
+
# Version control
|
206 |
+
version_control.commit(article_summary_prompt.id, "Initial version")
|
207 |
+
version_control.commit(research_question_prompt.id, "Initial version")
|
208 |
+
|
209 |
+
# OpenAI callback
|
210 |
+
def research_assistant(prompt_text):
|
211 |
+
response = openai.ChatCompletion.create(
|
212 |
+
model="gpt-4",
|
213 |
+
messages=[{"role": "user", "content": prompt_text}]
|
214 |
+
)
|
215 |
+
return response.choices[0].message.content
|
216 |
+
|
217 |
+
# Research functions
|
218 |
+
def summarize_article(article_title, article_abstract):
|
219 |
+
prompt = prompt_manager.get(article_summary_prompt.id)
|
220 |
+
prompt_text = prompt.render(
|
221 |
+
article_title=article_title,
|
222 |
+
article_abstract=article_abstract
|
223 |
+
)
|
224 |
+
return research_assistant(prompt_text)
|
225 |
+
|
226 |
+
def generate_research_questions(research_area, existing_knowledge, knowledge_gap):
|
227 |
+
prompt = prompt_manager.get(research_question_prompt.id)
|
228 |
+
prompt_text = prompt.render(
|
229 |
+
research_area=research_area,
|
230 |
+
existing_knowledge=existing_knowledge,
|
231 |
+
knowledge_gap=knowledge_gap
|
232 |
+
)
|
233 |
+
return research_assistant(prompt_text)
|
234 |
+
|
235 |
+
# Save results
|
236 |
+
def save_research_data(research_project, data_type, content):
|
237 |
+
# Save the data along with prompt metadata for reproducibility
|
238 |
+
if data_type == "summary":
|
239 |
+
prompt_id = article_summary_prompt.id
|
240 |
+
prompt = prompt_manager.get(prompt_id)
|
241 |
+
elif data_type == "questions":
|
242 |
+
prompt_id = research_question_prompt.id
|
243 |
+
prompt = prompt_manager.get(prompt_id)
|
244 |
+
|
245 |
+
research_data = {
|
246 |
+
"content": content,
|
247 |
+
"metadata": {
|
248 |
+
"prompt_id": prompt_id,
|
249 |
+
"prompt_version": prompt.version,
|
250 |
+
"timestamp": datetime.datetime.now().isoformat()
|
251 |
+
}
|
252 |
+
}
|
253 |
+
|
254 |
+
# Save to file (in real application, might save to database)
|
255 |
+
with open(f"{research_project}_{data_type}.json", "w") as f:
|
256 |
+
json.dump(research_data, f, indent=2)
|
257 |
+
```
|
258 |
+
|
259 |
+
## Educational Quiz Generator
|
260 |
+
|
261 |
+
### Setup
|
262 |
+
|
263 |
+
```python
|
264 |
+
from promptlab import PromptManager, PromptTemplate
|
265 |
+
import asyncio
|
266 |
+
import aiohttp
|
267 |
+
|
268 |
+
# Initialize components
|
269 |
+
prompt_manager = PromptManager("education_prompts")
|
270 |
+
|
271 |
+
# Quiz generation prompt
|
272 |
+
quiz_prompt = prompt_manager.create(
|
273 |
+
content="""
|
274 |
+
Generate a quiz on the topic of {topic} at a {difficulty_level} difficulty level.
|
275 |
+
|
276 |
+
The quiz should:
|
277 |
+
- Have {num_questions} multiple-choice questions
|
278 |
+
- Cover the following subtopics: {subtopics}
|
279 |
+
    - Include {include_explanation} explanations for the correct answers
    - Be appropriate for {grade_level} students

    For each question, provide:
    1. The question text
    2. Four possible answers (A, B, C, D)
    3. The correct answer
    {if include_explanation == "yes"}
    4. An explanation of why the answer is correct
    {endif}

    Format the output as valid JSON.
    """,
    name="Quiz Generator",
    tags=["education", "quiz"]
)

# Quiz rendering template using advanced templating
render_template = PromptTemplate("""
<h1>{quiz_title}</h1>

<form id="quiz-form">
  {for question in questions}
  <div class="question">
    <p><strong>Question {question.number}:</strong> {question.text}</p>
    <ul style="list-style-type: none;">
      {for option in question.options}
      <li>
        <input type="radio" name="q{question.number}" id="q{question.number}_{option.letter}" value="{option.letter}">
        <label for="q{question.number}_{option.letter}">{option.letter}. {option.text}</label>
      </li>
      {endfor}
    </ul>

    {if show_answers}
    <div class="answer">
      <p><strong>Correct Answer:</strong> {question.correct_answer}</p>
      {if question.has_explanation}
      <p><strong>Explanation:</strong> {question.explanation}</p>
      {endif}
    </div>
    {endif}
  </div>
  {endfor}

  {if !show_answers}
  <button type="submit">Submit Quiz</button>
  {endif}
</form>
""")

# LLM callback
async def education_llm_callback(prompt, vars):
    # This would call your LLM API
    # Simulated response for this example
    await asyncio.sleep(1)  # Simulate API call
    if "quiz" in prompt:
        return """
        {
          "questions": [
            {
              "text": "What is the capital of France?",
              "options": [
                {"letter": "A", "text": "London"},
                {"letter": "B", "text": "Berlin"},
                {"letter": "C", "text": "Paris"},
                {"letter": "D", "text": "Madrid"}
              ],
              "correct_answer": "C",
              "explanation": "Paris is the capital and most populous city of France."
            },
            {
              "text": "Who wrote 'Romeo and Juliet'?",
              "options": [
                {"letter": "A", "text": "Charles Dickens"},
                {"letter": "B", "text": "William Shakespeare"},
                {"letter": "C", "text": "Jane Austen"},
                {"letter": "D", "text": "Mark Twain"}
              ],
              "correct_answer": "B",
              "explanation": "William Shakespeare wrote 'Romeo and Juliet' around 1594-1596."
            }
          ]
        }
        """
    return "Default response"

# Quiz generation function
async def generate_quiz(topic, difficulty, num_questions, grade_level, include_explanations=True):
    prompt = prompt_manager.get(quiz_prompt.id)
    rendered_prompt = prompt.render(
        topic=topic,
        difficulty_level=difficulty,
        num_questions=num_questions,
        subtopics=", ".join(["key concepts", "historical context", "practical applications"]),
        include_explanation="yes" if include_explanations else "no",
        grade_level=grade_level
    )

    # Get quiz content from LLM
    quiz_json = await education_llm_callback(rendered_prompt, {})

    # Parse JSON
    quiz_data = json.loads(quiz_json)

    # Prepare data for HTML template
    template_data = {
        "quiz_title": f"{topic} Quiz ({difficulty} Level)",
        "questions": [],
        "show_answers": False
    }

    # Format questions
    for i, q in enumerate(quiz_data["questions"]):
        question = {
            "number": i + 1,
            "text": q["text"],
            "options": q["options"],
            "correct_answer": q["correct_answer"],
            "has_explanation": "explanation" in q,
            "explanation": q.get("explanation", "")
        }
        template_data["questions"].append(question)

    # Render HTML
    return render_template.render(**template_data)
```
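
Once the prompt, template, and callback are wired up, `generate_quiz` can be driven like any other coroutine. The call below is only a minimal sketch; the topic, difficulty, and grade level are arbitrary placeholder values.

```python
import asyncio

async def demo():
    # Produces the HTML form rendered by render_template above
    html = await generate_quiz(
        topic="World Geography",      # placeholder topic
        difficulty="Beginner",
        num_questions=2,
        grade_level="6th grade",
        include_explanations=True
    )
    print(html)

asyncio.run(demo())
```
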
## Automated Coding Assistant

### Setup

```python
from promptlab import PromptManager, PromptTesting
import asyncio
import re  # used below to extract code blocks from the LLM response
import subprocess
import tempfile

# Initialize components
prompt_manager = PromptManager("coding_prompts")
testing = PromptTesting(prompt_manager)

# Create code generation prompts
function_prompt = prompt_manager.create(
    content="""
    Write a {language} function that solves the following problem:

    {problem_description}

    Function signature: {function_signature}

    Requirements:
    - The function should handle edge cases
    - Include appropriate comments
    - Follow {language} best practices
    - Be optimized for {optimization_goal}

    {if include_tests == "yes"}
    Also include unit tests for the function.
    {endif}
    """,
    name="Function Generator",
    tags=["coding", "function"]
)

bug_fix_prompt = prompt_manager.create(
    content="""
    Debug the following {language} code which has an issue:

    ```{language}
    {buggy_code}
    ```

    Error message or problem description:
    {error_description}

    Please:
    1. Identify the issue
    2. Explain the root cause
    3. Provide a fixed version of the code
    4. Suggest how to prevent similar issues
    """,
    name="Bug Fix Assistant",
    tags=["coding", "debugging"]
)

# LLM callback
async def coding_llm_callback(prompt, vars):
    # This would call your LLM API
    # Simplified example response
    await asyncio.sleep(1)

    if "function" in prompt:
        return """
        ```python
        def find_max_subarray_sum(arr):
            '''
            Finds the maximum sum of any contiguous subarray.
            Uses Kadane's algorithm with O(n) time complexity.

            Args:
                arr: List of integers
            Returns:
                Maximum subarray sum
            '''
            if not arr:
                return 0

            current_max = global_max = arr[0]

            for num in arr[1:]:
                current_max = max(num, current_max + num)
                global_max = max(global_max, current_max)

            return global_max

        # Unit tests
        def test_find_max_subarray_sum():
            assert find_max_subarray_sum([]) == 0
            assert find_max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]) == 6
            assert find_max_subarray_sum([-1, -2, -3]) == -1
            print("All tests passed!")
        ```
        """
    elif "debug" in prompt.lower():
        return """
        The issue is a classic off-by-one error in the loop boundary.

        Root cause:
        The loop is using `i <= len(arr)` which accesses an index that's out of bounds.

        Fixed code:
        ```python
        def process_array(arr):
            result = []
            for i in range(len(arr)):  # Changed from i <= len(arr)
                result.append(arr[i] * 2)
            return result
        ```

        Prevention:
        - Remember that array indices are 0-based and go up to len(arr)-1
        - Use range() or enumerate() when iterating through arrays by index
        - Add bounds checking for critical operations
        """

    return "Default response"

# Function to test generated code
def test_generated_code(code, language):
    """Test the generated code by running it in a safe environment."""
    if language.lower() == "python":
        with tempfile.NamedTemporaryFile(suffix=".py") as temp:
            temp.write(code.encode())
            temp.flush()

            try:
                result = subprocess.run(["python", temp.name],
                                        capture_output=True,
                                        text=True,
                                        timeout=5)
                if result.returncode == 0:
                    return {"success": True, "output": result.stdout}
                else:
                    return {"success": False, "error": result.stderr}
            except subprocess.TimeoutExpired:
                return {"success": False, "error": "Code execution timed out"}

    return {"success": False, "error": f"Testing not implemented for {language}"}

# Main coding assistant function
async def generate_function(problem, language="python", optimization_goal="readability", include_tests=True):
    function_name = problem.lower().replace(" ", "_").replace("-", "_")
    signature = f"def {function_name}(parameters):"

    prompt = prompt_manager.get(function_prompt.id)
    rendered_prompt = prompt.render(
        language=language,
        problem_description=problem,
        function_signature=signature,
        optimization_goal=optimization_goal,
        include_tests="yes" if include_tests else "no"
    )

    # Get code from LLM
    generated_code = await coding_llm_callback(rendered_prompt, {})

    # Extract code from markdown if present
    if "```" in generated_code:
        code_blocks = re.findall(r"```(?:\w+)?\n(.+?)```", generated_code, re.DOTALL)
        if code_blocks:
            clean_code = code_blocks[0]
        else:
            clean_code = generated_code
    else:
        clean_code = generated_code

    # Test the code
    test_result = test_generated_code(clean_code, language)

    return {
        "code": clean_code,
        "test_result": test_result,
        "prompt_id": function_prompt.id
    }
```
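
As a closing note for this example, the coroutine above can be exercised the same way. This is only a sketch: the problem statement is an arbitrary placeholder that the simulated callback happens to recognize (it checks for the word "function" in the rendered prompt).

```python
import asyncio

async def demo():
    result = await generate_function(
        problem="find max subarray sum",   # placeholder problem description
        language="python",
        optimization_goal="speed",
        include_tests=True
    )
    print(result["test_result"])   # e.g. {"success": True, "output": "..."}
    print(result["code"])

asyncio.run(demo())
```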
promptlab/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
PromptLab - A comprehensive LLM Prompt Management System

PromptLab is a Python library that provides tools for managing, versioning,
testing, and evaluating prompts for Large Language Models.

Features:
- Prompt management with versioning
- A/B testing for prompt optimization
- Evaluation framework with customizable metrics
- Command-line interface for easy integration
"""

from .core.prompt_manager import PromptManager, Prompt
from .core.version_control import VersionControl, PromptVersion
from .core.testing import PromptTesting, TestCase, TestResult, ABTestResult
from .core.evaluation import Evaluator, EvaluationMetric, ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
from .utils.metrics import create_default_metrics_set
from .utils.templating import PromptTemplate, template_registry

__version__ = "0.1.0"
__all__ = [
    "PromptManager",
    "Prompt",
    "VersionControl",
    "PromptVersion",
    "PromptTesting",
    "TestCase",
    "TestResult",
    "ABTestResult",
    "Evaluator",
    "EvaluationMetric",
    "ExactMatchMetric",
    "ContainsKeywordsMetric",
    "LengthMetric",
    "create_default_metrics_set",
    "PromptTemplate",
    "template_registry"
]
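
The re-exports above define the package's public surface. A minimal usage sketch (the storage directory name is an arbitrary example):

```python
from promptlab import PromptManager, PromptTesting, Evaluator, PromptTemplate

manager = PromptManager("example_storage")   # arbitrary storage path
testing = PromptTesting(manager)
evaluator = Evaluator(manager)
template = PromptTemplate("Hello {name}")
```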
promptlab/cli/__init__.py
ADDED
File without changes
promptlab/cli/commands.py
ADDED
@@ -0,0 +1,697 @@
import argparse
import sys
import os
import json
from typing import List, Optional, Dict, Any
import asyncio

from ..core.prompt_manager import PromptManager
from ..core.version_control import VersionControl
from ..core.testing import PromptTesting
from ..core.evaluation import Evaluator, ContainsKeywordsMetric, LengthMetric


class CLI:
    """Command-line interface for PromptLab."""
    def __init__(self):
        self.prompt_manager = PromptManager()
        self.version_control = VersionControl(self.prompt_manager)
        self.testing = PromptTesting(self.prompt_manager)
        self.evaluator = Evaluator(self.prompt_manager)

        self.parser = argparse.ArgumentParser(description="PromptLab - LLM Prompt Management System")
        self._setup_commands()

    def _setup_commands(self) -> None:
        """Set up command-line arguments."""
        subparsers = self.parser.add_subparsers(dest="command", help="Command")

        # Prompt commands
        prompt_parser = subparsers.add_parser("prompt", help="Prompt management")
        prompt_subparsers = prompt_parser.add_subparsers(dest="subcommand", help="Prompt subcommand")

        # Create prompt
        create_parser = prompt_subparsers.add_parser("create", help="Create a new prompt")
        create_parser.add_argument("name", help="Prompt name")
        create_parser.add_argument("--content", help="Prompt content")
        create_parser.add_argument("--file", help="File containing prompt content")
        create_parser.add_argument("--description", help="Prompt description")
        create_parser.add_argument("--tags", help="Comma-separated list of tags")

        # List prompts
        list_parser = prompt_subparsers.add_parser("list", help="List prompts")
        list_parser.add_argument("--tags", help="Filter by comma-separated list of tags")

        # Get prompt
        get_parser = prompt_subparsers.add_parser("get", help="Get a prompt")
        get_parser.add_argument("id", help="Prompt ID")

        # Update prompt
        update_parser = prompt_subparsers.add_parser("update", help="Update a prompt")
        update_parser.add_argument("id", help="Prompt ID")
        update_parser.add_argument("--content", help="New prompt content")
        update_parser.add_argument("--file", help="File containing new prompt content")
        update_parser.add_argument("--name", help="New prompt name")
        update_parser.add_argument("--description", help="New prompt description")
        update_parser.add_argument("--tags", help="New comma-separated list of tags")

        # Delete prompt
        delete_parser = prompt_subparsers.add_parser("delete", help="Delete a prompt")
        delete_parser.add_argument("id", help="Prompt ID")

        # Version control commands
        version_parser = subparsers.add_parser("version", help="Version control")
        version_subparsers = version_parser.add_subparsers(dest="subcommand", help="Version subcommand")

        # Commit
        commit_parser = version_subparsers.add_parser("commit", help="Create a new version")
        commit_parser.add_argument("id", help="Prompt ID")
        commit_parser.add_argument("--message", help="Commit message")

        # List versions
        list_versions_parser = version_subparsers.add_parser("list", help="List versions")
        list_versions_parser.add_argument("id", help="Prompt ID")

        # Checkout
        checkout_parser = version_subparsers.add_parser("checkout", help="Checkout a version")
        checkout_parser.add_argument("id", help="Prompt ID")
        checkout_parser.add_argument("version", type=int, help="Version number")

        # Diff
        diff_parser = version_subparsers.add_parser("diff", help="Compare versions")
        diff_parser.add_argument("id", help="Prompt ID")
        diff_parser.add_argument("version1", type=int, help="First version")
        diff_parser.add_argument("version2", type=int, help="Second version")

        # Testing commands
        test_parser = subparsers.add_parser("test", help="Testing")
        test_subparsers = test_parser.add_subparsers(dest="subcommand", help="Test subcommand")

        # Create test case
        create_test_parser = test_subparsers.add_parser("create", help="Create a test case")
        create_test_parser.add_argument("prompt_id", help="Prompt ID")
        create_test_parser.add_argument("--input", help="JSON string of input variables")
        create_test_parser.add_argument("--input-file", help="File containing JSON input variables")
        create_test_parser.add_argument("--expected", help="Expected output")
        create_test_parser.add_argument("--expected-file", help="File containing expected output")
        create_test_parser.add_argument("--name", help="Test case name")
        create_test_parser.add_argument("--description", help="Test case description")

        # List test cases
        list_tests_parser = test_subparsers.add_parser("list", help="List test cases")
        list_tests_parser.add_argument("--prompt-id", help="Filter by prompt ID")

        # Run test case
        run_test_parser = test_subparsers.add_parser("run", help="Run a test case")
        run_test_parser.add_argument("test_id", help="Test case ID")
        run_test_parser.add_argument("--llm", help="LLM callback function to use")

        # Run all test cases for a prompt
        run_all_parser = test_subparsers.add_parser("run-all", help="Run all test cases for a prompt")
        run_all_parser.add_argument("prompt_id", help="Prompt ID")
        run_all_parser.add_argument("--llm", help="LLM callback function to use")

        # A/B test
        ab_test_parser = test_subparsers.add_parser("ab", help="Run an A/B test")
        ab_test_parser.add_argument("prompt_a", help="Prompt A ID")
        ab_test_parser.add_argument("prompt_b", help="Prompt B ID")
        ab_test_parser.add_argument("--llm", help="LLM callback function to use")
        ab_test_parser.add_argument("--test-cases", help="Comma-separated list of test case IDs")

        # Evaluation commands
        eval_parser = subparsers.add_parser("eval", help="Evaluation")
        eval_subparsers = eval_parser.add_subparsers(dest="subcommand", help="Evaluation subcommand")

        # List metrics
        list_metrics_parser = eval_subparsers.add_parser("metrics", help="List evaluation metrics")

        # Register metric
        register_metric_parser = eval_subparsers.add_parser("register", help="Register a custom metric")
        register_metric_parser.add_argument("name", help="Metric name")
        register_metric_parser.add_argument("--keywords", help="Keywords for ContainsKeywordsMetric")
        register_metric_parser.add_argument("--min-length", type=int, help="Minimum length for LengthMetric")
        register_metric_parser.add_argument("--max-length", type=int, help="Maximum length for LengthMetric")
        register_metric_parser.add_argument("--target-length", type=int, help="Target length for LengthMetric")

        # Evaluate prompt
        evaluate_parser = eval_subparsers.add_parser("run", help="Evaluate a prompt")
        evaluate_parser.add_argument("prompt_id", help="Prompt ID")
        evaluate_parser.add_argument("--inputs", help="JSON string of input variables list")
        evaluate_parser.add_argument("--inputs-file", help="File containing JSON input variables list")
        evaluate_parser.add_argument("--expected", help="JSON string of expected outputs list")
        evaluate_parser.add_argument("--expected-file", help="File containing JSON expected outputs list")
        evaluate_parser.add_argument("--metrics", help="Comma-separated list of metrics to use")
        evaluate_parser.add_argument("--llm", help="LLM callback function to use")

    def run(self, args: Optional[List[str]] = None) -> None:
        """Run the CLI with the given arguments."""
        args = self.parser.parse_args(args)

        if not args.command:
            self.parser.print_help()
            return

        # Handle commands
        if args.command == "prompt":
            self._handle_prompt_command(args)
        elif args.command == "version":
            self._handle_version_command(args)
        elif args.command == "test":
            self._handle_test_command(args)
        elif args.command == "eval":
            self._handle_eval_command(args)

    def _handle_prompt_command(self, args) -> None:
        """Handle prompt commands."""
        if not args.subcommand:
            return

        if args.subcommand == "create":
            # Get content from file or argument
            content = ""
            if args.file:
                with open(args.file, "r") as f:
                    content = f.read()
            elif args.content:
                content = args.content
            else:
                print("Error: Must provide either --content or --file")
                return

            # Parse tags
            tags = []
            if args.tags:
                tags = [tag.strip() for tag in args.tags.split(",")]

            # Create prompt
            prompt = self.prompt_manager.create(
                content=content,
                name=args.name,
                description=args.description,
                tags=tags
            )

            print(f"Created prompt with ID: {prompt.id}")

        elif args.subcommand == "list":
            # Parse tags
            tags = None
            if args.tags:
                tags = [tag.strip() for tag in args.tags.split(",")]

            # List prompts
            prompts = self.prompt_manager.list(tags)

            if not prompts:
                print("No prompts found")
                return

            # Print prompts
            print(f"Found {len(prompts)} prompts:")
            for prompt in prompts:
                tags_str = ", ".join(prompt.tags) if prompt.tags else ""
                print(f"ID: {prompt.id} | Name: {prompt.name} | Tags: {tags_str}")

        elif args.subcommand == "get":
            # Get prompt
            prompt = self.prompt_manager.get(args.id)

            if not prompt:
                print(f"Prompt with ID {args.id} not found")
                return

            # Print prompt
            print(f"ID: {prompt.id}")
            print(f"Name: {prompt.name}")
            print(f"Description: {prompt.description}")
            print(f"Tags: {', '.join(prompt.tags)}")
            print(f"Version: {prompt.version}")
            print(f"Created: {prompt.created_at}")
            print(f"Updated: {prompt.updated_at}")
            print("\nContent:")
            print(prompt.content)

        elif args.subcommand == "update":
            # Get prompt
            prompt = self.prompt_manager.get(args.id)

            if not prompt:
                print(f"Prompt with ID {args.id} not found")
                return

            # Update kwargs
            kwargs = {}

            if args.name:
                kwargs["name"] = args.name

            if args.description:
                kwargs["description"] = args.description

            if args.tags:
                kwargs["tags"] = [tag.strip() for tag in args.tags.split(",")]

            # Get content from file or argument
            if args.file:
                with open(args.file, "r") as f:
                    kwargs["content"] = f.read()
            elif args.content:
                kwargs["content"] = args.content

            # Update prompt
            prompt = self.prompt_manager.update(args.id, **kwargs)

            print(f"Updated prompt with ID: {prompt.id}")

        elif args.subcommand == "delete":
            # Delete prompt
            success = self.prompt_manager.delete(args.id)

            if success:
                print(f"Deleted prompt with ID: {args.id}")
            else:
                print(f"Prompt with ID {args.id} not found")

    def _handle_version_command(self, args) -> None:
        """Handle version control commands."""
        if not args.subcommand:
            return

        if args.subcommand == "commit":
            # Commit version
            version = self.version_control.commit(
                prompt_id=args.id,
                commit_message=args.message
            )

            if not version:
                print(f"Prompt with ID {args.id} not found")
                return

            print(f"Committed version {version.version} for prompt {args.id}")

        elif args.subcommand == "list":
            # List versions
            versions = self.version_control.list_versions(args.id)

            if not versions:
                print(f"No versions found for prompt {args.id}")
                return

            # Print versions
            print(f"Found {len(versions)} versions for prompt {args.id}:")
            for version in versions:
                message = version.commit_message or "No commit message"
                print(f"Version: {version.version} | Created: {version.created_at} | Message: {message}")

        elif args.subcommand == "checkout":
            # Checkout version
            prompt = self.version_control.checkout(
                prompt_id=args.id,
                version=args.version
            )

            if not prompt:
                print(f"Prompt with ID {args.id} or version {args.version} not found")
                return

            print(f"Checked out version {args.version} for prompt {args.id}")

        elif args.subcommand == "diff":
            # Diff versions
            diff = self.version_control.diff(
                prompt_id=args.id,
                version1=args.version1,
                version2=args.version2
            )

            if not diff:
                print(f"Could not compare versions {args.version1} and {args.version2} for prompt {args.id}")
                return

            # Print diff
            print(f"Diff between version {args.version1} and {args.version2} for prompt {args.id}:")
            for line in diff["diff"]:
                print(line)

    def _handle_test_command(self, args) -> None:
        """Handle testing commands."""
        if not args.subcommand:
            return

        if args.subcommand == "create":
            # Parse input variables
            input_vars = {}
            if args.input:
                input_vars = json.loads(args.input)
            elif args.input_file:
                with open(args.input_file, "r") as f:
                    input_vars = json.loads(f.read())
            else:
                print("Error: Must provide either --input or --input-file")
                return

            # Parse expected output
            expected = None
            if args.expected:
                expected = args.expected
            elif args.expected_file:
                with open(args.expected_file, "r") as f:
                    expected = f.read()

            # Create test case
            test_case = self.testing.create_test_case(
                prompt_id=args.prompt_id,
                input_vars=input_vars,
                expected_output=expected,
                name=args.name,
                description=args.description
            )

            print(f"Created test case with ID: {test_case.id}")

        elif args.subcommand == "list":
            # List test cases
            test_cases = self.testing.list_test_cases(args.prompt_id)

            if not test_cases:
                print("No test cases found")
                return

            # Print test cases
            print(f"Found {len(test_cases)} test cases:")
            for tc in test_cases:
                print(f"ID: {tc.id} | Name: {tc.name} | Prompt ID: {tc.prompt_id}")

        elif args.subcommand == "run":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run test case
            asyncio.run(self._run_test_case(args.test_id, llm_callback))

        elif args.subcommand == "run-all":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run all test cases
            asyncio.run(self._run_all_test_cases(args.prompt_id, llm_callback))

        elif args.subcommand == "ab":
            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Parse test case IDs
            test_cases = None
            if args.test_cases:
                test_cases = [tc.strip() for tc in args.test_cases.split(",")]

            # Run A/B test
            asyncio.run(self._run_ab_test(args.prompt_a, args.prompt_b, llm_callback, test_cases))

    async def _run_test_case(self, test_case_id, llm_callback) -> None:
        """Run a test case."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            result = await self.testing.run_test_case(
                test_case_id=test_case_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks
            )

            print(f"Test result ID: {result.id}")
            print(f"Test case ID: {result.test_case_id}")
            print(f"Prompt ID: {result.prompt_id}")
            print(f"Prompt version: {result.prompt_version}")
            print(f"Passed: {result.passed}")

            if result.metrics:
                print("\nMetrics:")
                for name, value in result.metrics.items():
                    print(f"{name}: {value}")

            print("\nOutput:")
            print(result.output)
        except Exception as e:
            print(f"Error running test case: {e}")

    async def _run_all_test_cases(self, prompt_id, llm_callback) -> None:
        """Run all test cases for a prompt."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            results = await self.testing.run_test_cases(
                prompt_id=prompt_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks
            )

            print(f"Ran {len(results)} test cases for prompt {prompt_id}")

            # Calculate aggregate metrics
            if results:
                passed = sum(1 for r in results if r.passed)
                print(f"Passed: {passed}/{len(results)} ({passed/len(results)*100:.2f}%)")

                # Aggregate metrics
                metrics = {}
                for r in results:
                    for name, value in r.metrics.items():
                        if name not in metrics:
                            metrics[name] = []
                        metrics[name].append(value)

                print("\nAggregate metrics:")
                for name, values in metrics.items():
                    avg = sum(values) / len(values)
                    print(f"{name}: {avg:.4f}")
        except Exception as e:
            print(f"Error running test cases: {e}")

    async def _run_ab_test(self, prompt_a_id, prompt_b_id, llm_callback, test_cases) -> None:
        """Run an A/B test."""
        try:
            metrics_callbacks = [
                self._create_metrics_callback("exact_match"),
                self._create_metrics_callback("similarity"),
                self._create_metrics_callback("length")
            ]

            result = await self.testing.run_ab_test(
                prompt_a_id=prompt_a_id,
                prompt_b_id=prompt_b_id,
                llm_callback=llm_callback,
                metrics_callbacks=metrics_callbacks,
                test_cases=test_cases
            )

            print(f"A/B test result ID: {result.id}")
            print(f"Prompt A ID: {result.prompt_a_id}")
            print(f"Prompt B ID: {result.prompt_b_id}")
            print(f"Winner: {result.winner or 'Tie'}")

            print("\nPrompt A metrics:")
            for name, value in result.metrics_a.items():
                print(f"{name}: {value:.4f}")

            print("\nPrompt B metrics:")
            for name, value in result.metrics_b.items():
                print(f"{name}: {value:.4f}")
        except Exception as e:
            print(f"Error running A/B test: {e}")

    def _handle_eval_command(self, args) -> None:
        """Handle evaluation commands."""
        if not args.subcommand:
            return

        if args.subcommand == "metrics":
            # List metrics
            metrics = self.evaluator.list_metrics()

            if not metrics:
                print("No metrics registered")
                return

            # Print metrics
            print(f"Found {len(metrics)} metrics:")
            for metric in metrics:
                print(f"Name: {metric.name} | Description: {metric.description}")

        elif args.subcommand == "register":
            # Register custom metric
            if args.keywords:
                # Register ContainsKeywordsMetric
                keywords = [k.strip() for k in args.keywords.split(",")]
                metric = ContainsKeywordsMetric(keywords)
                self.evaluator.register_metric(metric)
                print(f"Registered ContainsKeywordsMetric with name: {metric.name}")
            elif args.min_length is not None or args.max_length is not None or args.target_length is not None:
                # Register LengthMetric
                metric = LengthMetric(
                    min_length=args.min_length,
                    max_length=args.max_length,
                    target_length=args.target_length
                )
                self.evaluator.register_metric(metric)
                print(f"Registered LengthMetric with name: {metric.name}")
            else:
                print("Error: Must provide either --keywords, --min-length, --max-length, or --target-length")

        elif args.subcommand == "run":
            # Parse inputs
            inputs = []
            if args.inputs:
                inputs = json.loads(args.inputs)
            elif args.inputs_file:
                with open(args.inputs_file, "r") as f:
                    inputs = json.loads(f.read())
            else:
                print("Error: Must provide either --inputs or --inputs-file")
                return

            # Parse expected outputs
            expected_outputs = None
            if args.expected:
                expected_outputs = json.loads(args.expected)
            elif args.expected_file:
                with open(args.expected_file, "r") as f:
                    expected_outputs = json.loads(f.read())

            # Parse metrics
            metric_names = None
            if args.metrics:
                metric_names = [m.strip() for m in args.metrics.split(",")]

            # Get LLM callback
            llm_callback = self._get_llm_callback(args.llm)

            # Run evaluation
            asyncio.run(self._run_evaluation(
                args.prompt_id,
                inputs,
                expected_outputs,
                metric_names,
                llm_callback
            ))

    async def _run_evaluation(self, prompt_id, inputs, expected_outputs, metric_names, llm_callback) -> None:
        """Run an evaluation."""
        try:
            result = await self.evaluator.evaluate_prompt(
                prompt_id=prompt_id,
                inputs=inputs,
                llm_callback=llm_callback,
                expected_outputs=expected_outputs,
                metric_names=metric_names
            )

            print(f"Evaluated prompt {prompt_id} with {result['num_samples']} samples")

            # Print aggregated metrics
            print("\nAggregated metrics:")
            for name, value in result["aggregated_metrics"].items():
                print(f"{name}: {value:.4f}")

            # Print individual results
            print("\nIndividual results:")
            for i, r in enumerate(result["individual_results"]):
                print(f"\nSample {i+1}:")
                print(f"Input: {json.dumps(r['input'])}")
                print(f"Output: {r['output']}")
                if r["expected"]:
                    print(f"Expected: {r['expected']}")

                print("Metrics:")
                for name, value in r["metrics"].items():
                    print(f"{name}: {value:.4f}")
        except Exception as e:
            print(f"Error running evaluation: {e}")

    def _get_llm_callback(self, llm_name: Optional[str]) -> callable:
        """Get an LLM callback function."""
        # Default to a simple echo function for testing
        if not llm_name or llm_name == "echo":
            async def echo_callback(prompt, vars):
                return f"Echo: {prompt}"
            return echo_callback

        # Add more LLM callbacks as needed
        if llm_name == "openai":
            # Example implementation using OpenAI
            try:
                import openai

                async def openai_callback(prompt, vars):
                    response = await openai.Completion.acreate(
                        model="text-davinci-003",
                        prompt=prompt,
                        max_tokens=1000
                    )
                    return response.choices[0].text.strip()

                return openai_callback
            except ImportError:
                print("Error: OpenAI package not installed. Run `pip install openai` to use this LLM.")
                sys.exit(1)

        # Add more LLM implementations as needed

        print(f"Error: Unknown LLM callback: {llm_name}")
        sys.exit(1)

    def _create_metrics_callback(self, metric_type: str) -> callable:
        """Create a metrics callback function."""
        # Simple metrics
        if metric_type == "exact_match":
            def exact_match_callback(output, expected):
                if not expected:
                    return {"exact_match": 0.0}
                return {"exact_match": 1.0 if output.strip() == expected.strip() else 0.0}
            return exact_match_callback

        elif metric_type == "similarity":
            from difflib import SequenceMatcher

            def similarity_callback(output, expected):
                if not expected:
                    return {"similarity": 0.0}
                return {"similarity": SequenceMatcher(None, output, expected).ratio()}
            return similarity_callback

        elif metric_type == "length":
            def length_callback(output, expected):
                out_len = len(output)
                if not expected:
                    return {"length": 1.0 if out_len > 0 else 0.0}

                exp_len = len(expected)
                if exp_len == 0:
                    return {"length": 1.0 if out_len == 0 else 0.0}

                # Return score inversely proportional to the difference
                ratio = min(out_len / exp_len, exp_len / out_len)
                return {"length": ratio}
            return length_callback

        # Default no-op metric
        return lambda output, expected: {}


def main():
    """Main entry point for the CLI."""
    CLI().run()


if __name__ == "__main__":
    main()
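
The argparse tree built in `_setup_commands` can also be driven programmatically by handing `CLI.run` an argument list; the sketch below does exactly that. The prompt name, content, and tag are placeholders, and no installed console-script name is assumed here.

```python
from promptlab.cli.commands import CLI

cli = CLI()
cli.run(["prompt", "create", "Greeting", "--content", "Hello {name}", "--tags", "demo"])
cli.run(["prompt", "list", "--tags", "demo"])
cli.run(["eval", "metrics"])   # lists the metrics registered by Evaluator
```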
promptlab/core/__init__.py
ADDED
File without changes
promptlab/core/evaluation.py
ADDED
@@ -0,0 +1,191 @@
import os
import json
import datetime
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable
import asyncio
from .prompt_manager import PromptManager, Prompt

class EvaluationMetric:
    """Base class for evaluation metrics."""
    def __init__(self, name: str, description: Optional[str] = None):
        self.name = name
        self.description = description or ""

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Compute the metric. Must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement compute method")

class ExactMatchMetric(EvaluationMetric):
    """Evaluates exact match between generated and expected output."""
    def __init__(self):
        super().__init__("exact_match", "Exact match between generated and expected output")

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return 1.0 if generated matches expected exactly, 0.0 otherwise."""
        if not expected_output:
            return 0.0
        return 1.0 if generated_output.strip() == expected_output.strip() else 0.0

class ContainsKeywordsMetric(EvaluationMetric):
    """Evaluates if the generated output contains specified keywords."""
    def __init__(self, keywords: List[str], case_sensitive: bool = False):
        super().__init__(
            "contains_keywords",
            f"Check if output contains keywords: {', '.join(keywords)}"
        )
        self.keywords = keywords
        self.case_sensitive = case_sensitive

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return percentage of keywords found in the output."""
        if not self.keywords:
            return 0.0

        if not self.case_sensitive:
            generated_output = generated_output.lower()
            keywords = [k.lower() for k in self.keywords]
        else:
            keywords = self.keywords

        matches = sum(1 for k in keywords if k in generated_output)
        return matches / len(keywords)

class LengthMetric(EvaluationMetric):
    """Evaluates if the generated output length is within the desired range."""
    def __init__(self, min_length: Optional[int] = None, max_length: Optional[int] = None, target_length: Optional[int] = None):
        description = "Evaluate output length"
        if target_length is not None:
            description = f"Evaluate if output length is close to {target_length} characters"
        elif min_length is not None and max_length is not None:
            description = f"Evaluate if output length is between {min_length} and {max_length} characters"
        elif min_length is not None:
            description = f"Evaluate if output length is at least {min_length} characters"
        elif max_length is not None:
            description = f"Evaluate if output length is at most {max_length} characters"

        super().__init__("length", description)
        self.min_length = min_length
        self.max_length = max_length
        self.target_length = target_length

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return score based on length conditions."""
        length = len(generated_output)

        if self.target_length is not None:
            # Score inversely proportional to the distance from target
            max_distance = self.target_length  # Normalize to a max distance
            distance = abs(length - self.target_length)
            return max(0, 1 - (distance / max_distance))

        # Check if within bounds
        within_min = self.min_length is None or length >= self.min_length
        within_max = self.max_length is None or length <= self.max_length

        if within_min and within_max:
            return 1.0
        elif within_min and self.max_length:
            # Over max length, calculate proportional penalty
            return max(0, 1 - ((length - self.max_length) / self.max_length))
        elif within_max and self.min_length:
            # Under min length, calculate proportional penalty
            return max(0, length / self.min_length)
        return 0.0

class Evaluator:
    """Manages evaluation metrics and evaluation runs."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.metrics: Dict[str, EvaluationMetric] = {}
        self.storage_path = os.path.join(prompt_manager.storage_path, "evaluations")
        os.makedirs(self.storage_path, exist_ok=True)

        # Register built-in metrics
        self.register_metric(ExactMatchMetric())
        self.register_metric(ContainsKeywordsMetric(["important", "critical", "necessary"]))
        self.register_metric(LengthMetric(min_length=50, max_length=500))

    def register_metric(self, metric: EvaluationMetric) -> None:
        """Register a new evaluation metric."""
        self.metrics[metric.name] = metric

    def get_metric(self, name: str) -> Optional[EvaluationMetric]:
        """Get a registered metric by name."""
        return self.metrics.get(name)

    def list_metrics(self) -> List[EvaluationMetric]:
        """List all registered metrics."""
        return list(self.metrics.values())

    async def evaluate_prompt(
        self,
        prompt_id: str,
        inputs: List[Dict[str, Any]],
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        expected_outputs: Optional[List[Optional[str]]] = None,
        metric_names: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Evaluate a prompt with the given inputs and metrics."""
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {prompt_id} not found")

        # Use all registered metrics if none specified
        if not metric_names:
            metrics_to_use = list(self.metrics.values())
        else:
            metrics_to_use = [self.get_metric(name) for name in metric_names if self.get_metric(name)]

        if not metrics_to_use:
            raise ValueError("No valid metrics specified")

        # Ensure expected_outputs is the same length as inputs
        if expected_outputs is None:
            expected_outputs = [None] * len(inputs)
        elif len(expected_outputs) != len(inputs):
            raise ValueError("Expected outputs must be the same length as inputs")

        results = []
        for i, (input_vars, expected) in enumerate(zip(inputs, expected_outputs)):
            # Render the prompt
            rendered_prompt = prompt.render(**input_vars)

            # Generate output
            if asyncio.iscoroutinefunction(llm_callback):
                output = await llm_callback(rendered_prompt, input_vars)
            else:
                output = llm_callback(rendered_prompt, input_vars)

            # Compute metrics
            metrics_results = {}
            for metric in metrics_to_use:
                metrics_results[metric.name] = metric.compute(output, expected, **input_vars)

            results.append({
                "input": input_vars,
                "output": output,
                "expected": expected,
                "metrics": metrics_results
            })

        # Aggregate metrics
        aggregated_metrics = {}
        for metric in metrics_to_use:
            values = [r["metrics"][metric.name] for r in results]
            aggregated_metrics[metric.name] = sum(values) / len(values) if values else 0

        evaluation_result = {
            "prompt_id": prompt_id,
            "prompt_version": prompt.version,
            "num_samples": len(inputs),
            "aggregated_metrics": aggregated_metrics,
            "individual_results": results
        }

        # Save evaluation result
        timestamp = datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-")
        file_path = os.path.join(self.storage_path, f"eval_{prompt_id}_{timestamp}.json")
        with open(file_path, "w") as f:
            json.dump(evaluation_result, f, indent=2)

        return evaluation_result
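
Custom metrics only need to subclass `EvaluationMetric` and implement `compute`. The sketch below shows a hypothetical word-count metric; the class name and threshold are illustrative and not part of the library.

```python
class WordCountMetric(EvaluationMetric):
    """Hypothetical metric: full score once the output reaches min_words words."""
    def __init__(self, min_words: int = 10):
        super().__init__("word_count", f"Check that output has at least {min_words} words")
        self.min_words = min_words

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        words = len(generated_output.split())
        return 1.0 if words >= self.min_words else words / self.min_words

# Registration against an existing Evaluator instance:
# evaluator.register_metric(WordCountMetric(min_words=25))
```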
promptlab/core/prompt_manager.py
ADDED
@@ -0,0 +1,169 @@
import os
import json
import hashlib
import datetime
from typing import Dict, List, Optional, Union, Any

class Prompt:
    def __init__(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.content = content
        self.name = name
        self.description = description or ""
        self.tags = tags or []
        self.metadata = metadata or {}
        self.created_at = datetime.datetime.now().isoformat()
        self.updated_at = self.created_at
        self.id = self._generate_id()
        self.version = 1

    def _generate_id(self) -> str:
        """Generate a unique ID based on content and name."""
        unique_string = f"{self.name}:{self.content}:{self.created_at}"
        return hashlib.md5(unique_string.encode()).hexdigest()[:10]

    def update(self, content: Optional[str] = None, **kwargs) -> None:
        """Update prompt attributes."""
        if content is not None:
            self.content = content

        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        self.updated_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert prompt to dictionary."""
        return {
            "id": self.id,
            "name": self.name,
            "content": self.content,
            "description": self.description,
            "tags": self.tags,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "version": self.version
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Prompt":
        """Create prompt from dictionary."""
        prompt = cls(
            content=data["content"],
            name=data["name"],
            description=data.get("description", ""),
            tags=data.get("tags", []),
            metadata=data.get("metadata", {})
        )
        prompt.id = data["id"]
        prompt.created_at = data["created_at"]
        prompt.updated_at = data["updated_at"]
        prompt.version = data["version"]
        return prompt

    def render(self, **kwargs) -> str:
        """Render prompt with provided variables."""
        rendered = self.content
        for key, value in kwargs.items():
            placeholder = f"{{{key}}}"
            rendered = rendered.replace(placeholder, str(value))
        return rendered


class PromptManager:
    def __init__(self, storage_path: Optional[str] = None):
        self.storage_path = storage_path or os.path.join(os.getcwd(), "promptlab_storage")
        self.prompts: Dict[str, Prompt] = {}
        self._ensure_storage_dir()
        self._load_prompts()

    def _ensure_storage_dir(self) -> None:
        """Ensure storage directory exists."""
        os.makedirs(self.storage_path, exist_ok=True)

    def _load_prompts(self) -> None:
        """Load prompts from storage."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        if not os.path.exists(prompts_dir):
            os.makedirs(prompts_dir)
            return

        for filename in os.listdir(prompts_dir):
            if filename.endswith(".json"):
                with open(os.path.join(prompts_dir, filename), "r") as f:
                    prompt_data = json.load(f)
                    prompt = Prompt.from_dict(prompt_data)
                    self.prompts[prompt.id] = prompt

    def _save_prompt(self, prompt: Prompt) -> None:
        """Save prompt to storage."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        os.makedirs(prompts_dir, exist_ok=True)

        prompt_path = os.path.join(prompts_dir, f"{prompt.id}.json")
        with open(prompt_path, "w") as f:
            json.dump(prompt.to_dict(), f, indent=2)

    def create(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Prompt:
        """Create a new prompt."""
        prompt = Prompt(
            content=content,
            name=name,
            description=description,
            tags=tags,
            metadata=metadata
        )
        self.prompts[prompt.id] = prompt
        self._save_prompt(prompt)
        return prompt

    def get(self, prompt_id: str) -> Optional[Prompt]:
        """Get prompt by ID."""
        return self.prompts.get(prompt_id)

    def update(self, prompt_id: str, **kwargs) -> Optional[Prompt]:
        """Update prompt by ID."""
        prompt = self.get(prompt_id)
        if prompt:
            prompt.update(**kwargs)
            self._save_prompt(prompt)
        return prompt

    def delete(self, prompt_id: str) -> bool:
        """Delete prompt by ID."""
        if prompt_id in self.prompts:
            del self.prompts[prompt_id]
            prompt_path = os.path.join(self.storage_path, "prompts", f"{prompt_id}.json")
            if os.path.exists(prompt_path):
                os.remove(prompt_path)
            return True
        return False

    def list(self, tags: Optional[List[str]] = None) -> List[Prompt]:
        """List prompts, optionally filtered by tags."""
        if tags:
            return [p for p in self.prompts.values() if any(tag in p.tags for tag in tags)]
        return list(self.prompts.values())

    def search(self, query: str) -> List[Prompt]:
        """Search prompts by name or content."""
        query = query.lower()
        return [
            p for p in self.prompts.values()
            if query in p.name.lower() or query in p.content.lower()
        ]
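
A minimal round-trip sketch of the `Prompt`/`PromptManager` API above (the storage path and prompt text are arbitrary examples):

```python
from promptlab.core.prompt_manager import PromptManager

manager = PromptManager("example_storage")   # arbitrary storage path

prompt = manager.create(
    content="Summarize the following text: {text}",
    name="Summarizer",
    tags=["summarization"]
)

rendered = manager.get(prompt.id).render(text="PromptLab stores prompts as JSON files.")
print(rendered)

manager.update(prompt.id, description="Summarizes arbitrary text")
print(manager.search("summarize")[0].name)   # "Summarizer"
```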
promptlab/core/testing.py
ADDED
@@ -0,0 +1,451 @@
import os
import json
import uuid
import datetime
import asyncio
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable, Tuple
from .prompt_manager import Prompt, PromptManager

class TestCase:
    """Represents a test case for a prompt."""
    def __init__(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.prompt_id = prompt_id
        self.input_vars = input_vars
        self.expected_output = expected_output
        self.name = name or f"Test case {self.id}"
        self.description = description or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert test case to dictionary."""
        return {
            "id": self.id,
            "prompt_id": self.prompt_id,
            "input_vars": self.input_vars,
            "expected_output": self.expected_output,
            "name": self.name,
            "description": self.description,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestCase":
        """Create test case from dictionary."""
        test_case = cls(
            prompt_id=data["prompt_id"],
            input_vars=data["input_vars"],
            expected_output=data.get("expected_output"),
            name=data.get("name"),
            description=data.get("description")
        )
        test_case.id = data["id"]
        test_case.created_at = data["created_at"]
        return test_case


class TestResult:
    """Represents the result of a test case execution."""
    def __init__(
        self,
        test_case_id: str,
        prompt_id: str,
        prompt_version: int,
        output: str,
        passed: Optional[bool] = None,
        metrics: Optional[Dict[str, float]] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.test_case_id = test_case_id
        self.prompt_id = prompt_id
        self.prompt_version = prompt_version
        self.output = output
        self.passed = passed
        self.metrics = metrics or {}
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert test result to dictionary."""
        return {
            "id": self.id,
            "test_case_id": self.test_case_id,
            "prompt_id": self.prompt_id,
            "prompt_version": self.prompt_version,
            "output": self.output,
            "passed": self.passed,
            "metrics": self.metrics,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestResult":
        """Create test result from dictionary."""
        return cls(
            test_case_id=data["test_case_id"],
            prompt_id=data["prompt_id"],
            prompt_version=data["prompt_version"],
            output=data["output"],
            passed=data.get("passed"),
            metrics=data.get("metrics", {})
        )


class ABTestResult:
    """Represents the result of an A/B test."""
    def __init__(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        prompt_a_version: int,
        prompt_b_version: int,
        metrics_a: Dict[str, float],
        metrics_b: Dict[str, float],
        winner: Optional[str] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.prompt_a_id = prompt_a_id
        self.prompt_b_id = prompt_b_id
        self.prompt_a_version = prompt_a_version
        self.prompt_b_version = prompt_b_version
        self.metrics_a = metrics_a
        self.metrics_b = metrics_b
        self.winner = winner
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert A/B test result to dictionary."""
        return {
            "id": self.id,
            "prompt_a_id": self.prompt_a_id,
            "prompt_b_id": self.prompt_b_id,
            "prompt_a_version": self.prompt_a_version,
            "prompt_b_version": self.prompt_b_version,
            "metrics_a": self.metrics_a,
            "metrics_b": self.metrics_b,
            "winner": self.winner,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ABTestResult":
        """Create A/B test result from dictionary."""
        return cls(
            prompt_a_id=data["prompt_a_id"],
            prompt_b_id=data["prompt_b_id"],
            prompt_a_version=data["prompt_a_version"],
            prompt_b_version=data["prompt_b_version"],
            metrics_a=data["metrics_a"],
            metrics_b=data["metrics_b"],
            winner=data.get("winner")
        )


class PromptTesting:
    """Manages testing for prompts."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "tests")
        os.makedirs(self.storage_path, exist_ok=True)

        # Storage paths
        self.test_cases_path = os.path.join(self.storage_path, "test_cases")
        self.test_results_path = os.path.join(self.storage_path, "test_results")
        self.ab_test_results_path = os.path.join(self.storage_path, "ab_test_results")

        os.makedirs(self.test_cases_path, exist_ok=True)
        os.makedirs(self.test_results_path, exist_ok=True)
        os.makedirs(self.ab_test_results_path, exist_ok=True)

        self.test_cases: Dict[str, TestCase] = {}
        self.test_results: Dict[str, TestResult] = {}
        self.ab_test_results: Dict[str, ABTestResult] = {}

        self._load_test_cases()
        self._load_test_results()
        self._load_ab_test_results()

    def _load_test_cases(self) -> None:
        """Load test cases from storage."""
        for filename in os.listdir(self.test_cases_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_cases_path, filename), "r") as f:
                    data = json.load(f)
                    test_case = TestCase.from_dict(data)
                    self.test_cases[test_case.id] = test_case

    def _load_test_results(self) -> None:
        """Load test results from storage."""
        for filename in os.listdir(self.test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_results_path, filename), "r") as f:
                    data = json.load(f)
                    test_result = TestResult.from_dict(data)
                    self.test_results[test_result.id] = test_result

    def _load_ab_test_results(self) -> None:
        """Load A/B test results from storage."""
        for filename in os.listdir(self.ab_test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.ab_test_results_path, filename), "r") as f:
                    data = json.load(f)
                    ab_test_result = ABTestResult.from_dict(data)
                    self.ab_test_results[ab_test_result.id] = ab_test_result

    def _save_test_case(self, test_case: TestCase) -> None:
        """Save test case to storage."""
        file_path = os.path.join(self.test_cases_path, f"{test_case.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_case.to_dict(), f, indent=2)

    def _save_test_result(self, test_result: TestResult) -> None:
        """Save test result to storage."""
        file_path = os.path.join(self.test_results_path, f"{test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_result.to_dict(), f, indent=2)

    def _save_ab_test_result(self, ab_test_result: ABTestResult) -> None:
        """Save A/B test result to storage."""
        file_path = os.path.join(self.ab_test_results_path, f"{ab_test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(ab_test_result.to_dict(), f, indent=2)

    def create_test_case(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ) -> TestCase:
        """Create a test case for a prompt."""
        test_case = TestCase(
            prompt_id=prompt_id,
            input_vars=input_vars,
            expected_output=expected_output,
            name=name,
            description=description
        )
        self.test_cases[test_case.id] = test_case
        self._save_test_case(test_case)
        return test_case

    def get_test_case(self, test_case_id: str) -> Optional[TestCase]:
        """Get a test case by ID."""
        return self.test_cases.get(test_case_id)

    def list_test_cases(self, prompt_id: Optional[str] = None) -> List[TestCase]:
        """List test cases, optionally filtered by prompt ID."""
        if prompt_id:
            return [tc for tc in self.test_cases.values() if tc.prompt_id == prompt_id]
        return list(self.test_cases.values())

    def delete_test_case(self, test_case_id: str) -> bool:
        """Delete a test case by ID."""
        if test_case_id in self.test_cases:
            del self.test_cases[test_case_id]
            file_path = os.path.join(self.test_cases_path, f"{test_case_id}.json")
            if os.path.exists(file_path):
                os.remove(file_path)
            return True
        return False

    async def run_test_case(
        self,
        test_case_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> TestResult:
        """Run a test case with the given LLM callback."""
        test_case = self.get_test_case(test_case_id)
        if not test_case:
            raise ValueError(f"Test case with ID {test_case_id} not found")

        prompt = self.prompt_manager.get(test_case.prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {test_case.prompt_id} not found")

        # Render the prompt with the input variables
        rendered_prompt = prompt.render(**test_case.input_vars)

        # Call the LLM with the rendered prompt
        if asyncio.iscoroutinefunction(llm_callback):
            output = await llm_callback(rendered_prompt, test_case.input_vars)
        else:
            output = llm_callback(rendered_prompt, test_case.input_vars)

        # Determine if the test passed
        passed = None
        if test_case.expected_output:
            passed = output.strip() == test_case.expected_output.strip()

        # Calculate metrics if callbacks are provided
        metrics = {}
        if metrics_callbacks:
            for metric_callback in metrics_callbacks:
                metrics.update(metric_callback(output, test_case.expected_output or ""))

        # Create and save the test result
        test_result = TestResult(
            test_case_id=test_case.id,
            prompt_id=test_case.prompt_id,
            prompt_version=prompt.version,
            output=output,
            passed=passed,
            metrics=metrics
        )
        self.test_results[test_result.id] = test_result
        self._save_test_result(test_result)

        return test_result

    async def run_test_cases(
        self,
        prompt_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> List[TestResult]:
        """Run all test cases for a prompt."""
        test_cases = self.list_test_cases(prompt_id)
        results = []

        for test_case in test_cases:
            result = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            results.append(result)

        return results

    async def run_ab_test(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: List[Callable[[str, str], Dict[str, float]]],
        test_cases: Optional[List[str]] = None
    ) -> ABTestResult:
        """Run an A/B test with two prompts."""
        prompt_a = self.prompt_manager.get(prompt_a_id)
        prompt_b = self.prompt_manager.get(prompt_b_id)

        if not prompt_a or not prompt_b:
            raise ValueError("Both prompts must exist")

        # Get test cases to use
        if test_cases:
            # Use specified test cases
            test_case_objs = [self.get_test_case(tc_id) for tc_id in test_cases]
            test_case_objs = [tc for tc in test_case_objs if tc]
        else:
            # Use all test cases for prompt A
            test_case_objs = self.list_test_cases(prompt_a_id)

        if not test_case_objs:
            raise ValueError("No test cases found for the A/B test")

        # Run test cases for both prompts
        results_a = []
        results_b = []

        for test_case in test_case_objs:
            # Create a copy of the test case for prompt B
            if test_case.prompt_id != prompt_b_id:
                test_case_b = self.create_test_case(
                    prompt_id=prompt_b_id,
                    input_vars=test_case.input_vars,
                    expected_output=test_case.expected_output,
                    name=f"Copy of {test_case.name} for B",
                    description=test_case.description
                )
            else:
                test_case_b = test_case

            # Run the test cases
            result_a = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            result_b = await self.run_test_case(test_case_b.id, llm_callback, metrics_callbacks)

            results_a.append(result_a)
            results_b.append(result_b)

        # Calculate aggregate metrics
        metrics_a = self._aggregate_metrics([r.metrics for r in results_a])
        metrics_b = self._aggregate_metrics([r.metrics for r in results_b])

        # Determine winner
        winner = self._determine_winner(metrics_a, metrics_b)

        # Create and save the A/B test result
        ab_test_result = ABTestResult(
            prompt_a_id=prompt_a_id,
            prompt_b_id=prompt_b_id,
            prompt_a_version=prompt_a.version,
            prompt_b_version=prompt_b.version,
            metrics_a=metrics_a,
            metrics_b=metrics_b,
            winner=winner
        )
        self.ab_test_results[ab_test_result.id] = ab_test_result
        self._save_ab_test_result(ab_test_result)

        return ab_test_result

    def _aggregate_metrics(self, metrics_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Aggregate metrics from multiple test results."""
        if not metrics_list:
            return {}

        aggregated = {}
        for key in metrics_list[0].keys():
            values = [m.get(key, 0) for m in metrics_list]
            aggregated[key] = sum(values) / len(values)  # Simple average

        return aggregated

    def _determine_winner(self, metrics_a: Dict[str, float], metrics_b: Dict[str, float]) -> Optional[str]:
        """Determine winner of A/B test based on metrics."""
        if not metrics_a or not metrics_b:
            return None

        # Assume higher values are better for all metrics
        a_wins = 0
        b_wins = 0

        for key in metrics_a.keys():
            if key in metrics_b:
                if metrics_a[key] > metrics_b[key]:
                    a_wins += 1
                elif metrics_b[key] > metrics_a[key]:
                    b_wins += 1

        if a_wins > b_wins:
            return "A"
        elif b_wins > a_wins:
            return "B"
        else:
            return None  # Tie

    def get_test_results(self, test_case_id: Optional[str] = None, prompt_id: Optional[str] = None) -> List[TestResult]:
        """Get test results, optionally filtered by test case ID or prompt ID."""
        results = list(self.test_results.values())

        if test_case_id:
            results = [r for r in results if r.test_case_id == test_case_id]

        if prompt_id:
            results = [r for r in results if r.prompt_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)

    def get_ab_test_results(self, prompt_id: Optional[str] = None) -> List[ABTestResult]:
        """Get A/B test results, optionally filtered by prompt ID."""
        results = list(self.ab_test_results.values())

        if prompt_id:
            results = [r for r in results if r.prompt_a_id == prompt_id or r.prompt_b_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)
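A minimal single-test sketch of the flow above, with a stubbed synchronous callback standing in for a real model call. The prompt text, expected output, and metric are illustrative, and it assumes Prompt exposes render() and a version attribute, as run_test_case relies on.

    # Hypothetical usage sketch; fake_llm and exact() are stand-ins, not library code.
    import asyncio
    from promptlab.core.prompt_manager import PromptManager
    from promptlab.core.testing import PromptTesting

    manager = PromptManager("./promptlab_storage")
    testing = PromptTesting(manager)

    prompt = manager.create(content="Translate to French: {text}", name="Translator")
    case = testing.create_test_case(
        prompt_id=prompt.id,
        input_vars={"text": "hello"},
        expected_output="bonjour",
    )

    def fake_llm(rendered_prompt, input_vars):
        return "bonjour"  # a real integration would call an LLM API here

    def exact(output, expected):
        return {"exact": 1.0 if output.strip() == expected.strip() else 0.0}

    result = asyncio.run(testing.run_test_case(case.id, fake_llm, [exact]))
    print(result.passed, result.metrics)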
promptlab/core/version_control.py
ADDED
@@ -0,0 +1,161 @@
import os
import json
import datetime
from typing import Dict, List, Optional, Any
from .prompt_manager import Prompt, PromptManager

class PromptVersion:
    """Represents a specific version of a prompt."""
    def __init__(
        self,
        prompt_id: str,
        version: int,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        commit_message: Optional[str] = None
    ):
        self.prompt_id = prompt_id
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.commit_message = commit_message or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert version to dictionary."""
        return {
            "prompt_id": self.prompt_id,
            "version": self.version,
            "content": self.content,
            "metadata": self.metadata,
            "commit_message": self.commit_message,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
        """Create version from dictionary."""
        return cls(
            prompt_id=data["prompt_id"],
            version=data["version"],
            content=data["content"],
            metadata=data.get("metadata", {}),
            commit_message=data.get("commit_message", "")
        )


class VersionControl:
    """Manages versioning for prompts."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "versions")
        os.makedirs(self.storage_path, exist_ok=True)
        self.versions: Dict[str, Dict[int, PromptVersion]] = {}
        self._load_versions()

    def _load_versions(self) -> None:
        """Load versions from storage."""
        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
            return

        for prompt_id_dir in os.listdir(self.storage_path):
            prompt_dir = os.path.join(self.storage_path, prompt_id_dir)
            if os.path.isdir(prompt_dir):
                self.versions[prompt_id_dir] = {}

                for filename in os.listdir(prompt_dir):
                    if filename.endswith(".json"):
                        with open(os.path.join(prompt_dir, filename), "r") as f:
                            version_data = json.load(f)
                            version = PromptVersion.from_dict(version_data)
                            self.versions[prompt_id_dir][version.version] = version

    def _save_version(self, version: PromptVersion) -> None:
        """Save version to storage."""
        prompt_dir = os.path.join(self.storage_path, version.prompt_id)
        os.makedirs(prompt_dir, exist_ok=True)

        version_path = os.path.join(prompt_dir, f"v{version.version}.json")
        with open(version_path, "w") as f:
            json.dump(version.to_dict(), f, indent=2)

    def commit(
        self,
        prompt_id: str,
        commit_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Optional[PromptVersion]:
        """Create a new version of a prompt."""
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            return None

        # Initialize versions dict for this prompt if it doesn't exist
        if prompt_id not in self.versions:
            self.versions[prompt_id] = {}

        # Get the highest version number for this prompt
        current_versions = self.versions.get(prompt_id, {})
        next_version = max(current_versions.keys(), default=0) + 1

        # Create the new version
        version = PromptVersion(
            prompt_id=prompt_id,
            version=next_version,
            content=prompt.content,
            metadata=metadata or {},
            commit_message=commit_message
        )

        # Save the new version
        self.versions[prompt_id][next_version] = version
        self._save_version(version)

        # Update the prompt's version number
        prompt.version = next_version
        self.prompt_manager._save_prompt(prompt)

        return version

    def get_version(self, prompt_id: str, version: int) -> Optional[PromptVersion]:
        """Get a specific version of a prompt."""
        return self.versions.get(prompt_id, {}).get(version)

    def list_versions(self, prompt_id: str) -> List[PromptVersion]:
        """List all versions of a prompt."""
        versions = self.versions.get(prompt_id, {})
        return sorted(versions.values(), key=lambda v: v.version)

    def checkout(self, prompt_id: str, version: int) -> Optional[Prompt]:
        """Checkout a specific version of a prompt."""
        prompt = self.prompt_manager.get(prompt_id)
        version_obj = self.get_version(prompt_id, version)

        if not prompt or not version_obj:
            return None

        prompt.content = version_obj.content
        prompt.version = version
        prompt.updated_at = datetime.datetime.now().isoformat()

        self.prompt_manager._save_prompt(prompt)
        return prompt

    def diff(self, prompt_id: str, version1: int, version2: int) -> Dict[str, Any]:
        """Compare two versions of a prompt."""
        v1 = self.get_version(prompt_id, version1)
        v2 = self.get_version(prompt_id, version2)

        if not v1 or not v2:
            return {}

        import difflib
        d = difflib.Differ()
        diff = list(d.compare(v1.content.splitlines(), v2.content.splitlines()))

        return {
            "version1": version1,
            "version2": version2,
            "diff": diff
        }
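A short sketch of the commit/diff cycle above. The prompt wording and commit messages are examples, and it assumes PromptManager.update() accepts a content keyword, as in the basic usage example later in this commit.

    # Hypothetical usage sketch: two commits, then a line-level diff.
    from promptlab.core.prompt_manager import PromptManager
    from promptlab.core.version_control import VersionControl

    manager = PromptManager("./promptlab_storage")
    vc = VersionControl(manager)

    prompt = manager.create(content="Answer briefly: {question}", name="Brief QA")
    vc.commit(prompt.id, commit_message="initial wording")

    manager.update(prompt.id, content="Answer in one sentence: {question}")
    vc.commit(prompt.id, commit_message="tightened instruction")

    for line in vc.diff(prompt.id, 1, 2)["diff"]:
        print(line)  # difflib.Differ output: "+", "-", "?" prefixed lines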
promptlab/examples/__init__.py
ADDED
File without changes
promptlab/examples/ab_testing.py
ADDED
@@ -0,0 +1,117 @@
"""
A/B testing example for PromptLab.

This example demonstrates how to use PromptLab to perform A/B testing
on different prompt variations to find the most effective one.
"""

import asyncio
import os
from promptlab import PromptManager, PromptTesting

async def llm_callback(prompt, vars):
    """
    Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API.
    """
    # Simple simulation - return different responses based on prompt content
    if "concise" in prompt.lower():
        return "This is a short, concise response."
    elif "detailed" in prompt.lower():
        return "This is a much more detailed response that provides additional context and information about the query. It elaborates on various aspects and provides a comprehensive answer."
    else:
        return "Default response."

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize testing
    testing = PromptTesting(prompt_manager)

    # Create two prompt variations
    prompt_a = prompt_manager.create(
        content="Provide a concise answer to the following question: {question}",
        name="Concise Prompt",
        description="A prompt that asks for concise answers",
        tags=["concise", "test"]
    )

    prompt_b = prompt_manager.create(
        content="Provide a detailed and comprehensive answer to the following question: {question}",
        name="Detailed Prompt",
        description="A prompt that asks for detailed answers",
        tags=["detailed", "test"]
    )

    print(f"Created prompt A with ID: {prompt_a.id}")
    print(f"Created prompt B with ID: {prompt_b.id}")

    # Create test cases
    test_cases = []

    questions = [
        "What is machine learning?",
        "How does a neural network work?",
        "What are the benefits of version control?"
    ]

    for i, question in enumerate(questions):
        test_case = testing.create_test_case(
            prompt_id=prompt_a.id,
            input_vars={"question": question},
            name=f"Test Case {i+1}",
            description=f"Test case for question: {question}"
        )
        test_cases.append(test_case.id)

    print(f"Created {len(test_cases)} test cases")

    # Define metrics callbacks
    def length_metric(output, expected):
        """Measure output length as a metric."""
        return {"length": len(output) / 1000}  # Normalize to 0-1 range

    def keyword_metric(output, expected):
        """Check for presence of keywords."""
        keywords = ["machine", "learning", "neural", "network", "version", "control"]
        matches = sum(1 for k in keywords if k.lower() in output.lower())
        return {"keyword_matches": matches / len(keywords)}

    # Run A/B test
    ab_result = await testing.run_ab_test(
        prompt_a_id=prompt_a.id,
        prompt_b_id=prompt_b.id,
        llm_callback=llm_callback,
        metrics_callbacks=[length_metric, keyword_metric],
        test_cases=test_cases
    )

    print(f"A/B test completed with ID: {ab_result.id}")
    print(f"Prompt A metrics: {ab_result.metrics_a}")
    print(f"Prompt B metrics: {ab_result.metrics_b}")
    print(f"Winner: {ab_result.winner or 'Tie'}")

    # List all test results
    results_a = testing.get_test_results(prompt_id=prompt_a.id)
    results_b = testing.get_test_results(prompt_id=prompt_b.id)

    print(f"Found {len(results_a)} test results for prompt A")
    print(f"Found {len(results_b)} test results for prompt B")

    # Display individual test results
    print("\nSample outputs:")

    for i, (result_a, result_b) in enumerate(zip(results_a[:3], results_b[:3])):
        print(f"\nTest Case {i+1}:")

        print("\nConcise prompt output:")
        print(result_a.output)

        print("\nDetailed prompt output:")
        print(result_b.output)

if __name__ == "__main__":
    asyncio.run(main())
promptlab/examples/basic_usage.py
ADDED
@@ -0,0 +1,109 @@
"""
Basic usage example for PromptLab.

This example demonstrates the fundamental features of PromptLab
including creating prompts, versioning, and rendering.
"""

import asyncio
import os
from promptlab import PromptManager, VersionControl

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize version control
    version_control = VersionControl(prompt_manager)

    # Create a basic prompt
    basic_prompt = prompt_manager.create(
        content="Hello, my name is {name} and I am a {occupation}.",
        name="Introduction",
        description="A simple introduction prompt",
        tags=["basic", "introduction"]
    )

    print(f"Created prompt with ID: {basic_prompt.id}")

    # Render the prompt with variables
    rendered = basic_prompt.render(name="Alice", occupation="Data Scientist")
    print(f"Rendered prompt: {rendered}")

    # Create a more complex prompt
    complex_prompt = prompt_manager.create(
        content="""
        System: {system_message}

        User: {user_message}

        Assistant:
        """,
        name="Chat Interaction",
        description="A prompt for chat interactions",
        tags=["chat", "interaction"]
    )

    print(f"Created complex prompt with ID: {complex_prompt.id}")

    # Render the complex prompt
    rendered = complex_prompt.render(
        system_message="You are a helpful assistant.",
        user_message="Can you help me with Python programming?"
    )
    print(f"Rendered complex prompt:\n{rendered}")

    # Create a version
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Initial version"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # Update the prompt
    complex_prompt = prompt_manager.update(
        complex_prompt.id,
        content="""
        System: {system_message}

        User: {user_message}

        Think step by step:
        {thinking}

        Assistant:
        """
    )

    print(f"Updated prompt with ID: {complex_prompt.id}")

    # Create another version
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Added thinking step"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # List all versions
    versions = version_control.list_versions(complex_prompt.id)
    print(f"Found {len(versions)} versions for prompt {complex_prompt.id}:")
    for v in versions:
        print(f"Version: {v.version} | Created: {v.created_at} | Message: {v.commit_message}")

    # Checkout a specific version
    prompt = version_control.checkout(complex_prompt.id, 1)
    print(f"Checked out version 1 for prompt {complex_prompt.id}")
    print(f"Content:\n{prompt.content}")

    # List all prompts
    prompts = prompt_manager.list()
    print(f"Found {len(prompts)} prompts:")
    for p in prompts:
        print(f"ID: {p.id} | Name: {p.name} | Tags: {', '.join(p.tags)}")

if __name__ == "__main__":
    asyncio.run(main())
promptlab/examples/evaluation_example.py
ADDED
@@ -0,0 +1,95 @@
"""
Evaluation example for PromptLab.

This example demonstrates how to use PromptLab's evaluation framework
to measure the quality of prompts using various metrics.
"""

import asyncio
import os
from promptlab import PromptManager, Evaluator, ContainsKeywordsMetric, LengthMetric

async def llm_callback(prompt, vars):
    """
    Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API.
    """
    # Simple simulation based on input text
    text = vars.get("text", "")

    if "code" in text.lower():
        return "```python\ndef hello_world():\n print('Hello, world!')\n```"
    elif "list" in text.lower():
        return "1. First item\n2. Second item\n3. Third item"
    elif "summary" in text.lower():
        return f"This is a summary of the text about {text.split()[0]}."
    else:
        return f"Response to: {text}"

async def main():
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize evaluator
    evaluator = Evaluator(prompt_manager)

    # Create a prompt for evaluation
    prompt = prompt_manager.create(
        content="Please {action} the following text: {text}",
        name="Dynamic Action Prompt",
        description="A prompt that can perform different actions based on input",
        tags=["action", "dynamic"]
    )

    print(f"Created prompt with ID: {prompt.id}")

    # Register custom metrics
    code_keywords = ContainsKeywordsMetric(
        keywords=["def", "print", "function", "return"],
        case_sensitive=False
    )
    evaluator.register_metric(code_keywords)

    list_keywords = ContainsKeywordsMetric(
        keywords=["1.", "2.", "3.", "item"],
        case_sensitive=False
    )
    evaluator.register_metric(list_keywords)

    length_metric = LengthMetric(min_length=10, max_length=500)
    evaluator.register_metric(length_metric)

    # Create test inputs for different actions
    test_inputs = [
        {"action": "write code for", "text": "a simple hello world function"},
        {"action": "create a list of", "text": "three important items"},
        {"action": "summarize", "text": "machine learning concepts in data science"},
        {"action": "analyze", "text": "the impact of climate change on ecosystems"}
    ]

    # Run evaluation
    evaluation_result = await evaluator.evaluate_prompt(
        prompt_id=prompt.id,
        inputs=test_inputs,
        llm_callback=llm_callback
    )

    # Print evaluation results
    print("\nEvaluation completed!")
    print("\nAggregated metrics:")
    for name, value in evaluation_result["aggregated_metrics"].items():
        print(f"{name}: {value:.4f}")

    print("\nIndividual results:")
    for i, result in enumerate(evaluation_result["individual_results"]):
        print(f"\nTest {i+1} ({result['input']['action']} {result['input']['text']}):")
        print(f"Output: {result['output']}")

        print("Metrics:")
        for name, value in result["metrics"].items():
            print(f"  {name}: {value:.4f}")

if __name__ == "__main__":
    asyncio.run(main())
promptlab/tests/__init__.py
ADDED
File without changes
promptlab/tests/test_evaluation.py
ADDED
File without changes
promptlab/tests/test_prompt_manager.py
ADDED
@@ -0,0 +1,115 @@
import unittest
import os
import shutil
import tempfile
from promptlab.core.prompt_manager import PromptManager, Prompt

class TestPromptManager(unittest.TestCase):
    def setUp(self):
        """Set up test environment."""
        self.test_dir = tempfile.mkdtemp()
        self.prompt_manager = PromptManager(self.test_dir)

    def tearDown(self):
        """Clean up test environment."""
        shutil.rmtree(self.test_dir)

    def test_create_prompt(self):
        """Test creating a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt {var}",
            name="Test Prompt",
            description="A test prompt",
            tags=["test", "example"]
        )

        self.assertIsNotNone(prompt)
        self.assertEqual(prompt.name, "Test Prompt")
        self.assertEqual(prompt.content, "Test prompt {var}")
        self.assertEqual(prompt.description, "A test prompt")
        self.assertEqual(prompt.tags, ["test", "example"])

    def test_get_prompt(self):
        """Test getting a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        retrieved = self.prompt_manager.get(prompt.id)

        self.assertIsNotNone(retrieved)
        self.assertEqual(retrieved.id, prompt.id)
        self.assertEqual(retrieved.name, prompt.name)
        self.assertEqual(retrieved.content, prompt.content)

    def test_update_prompt(self):
        """Test updating a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        updated = self.prompt_manager.update(
            prompt.id,
            content="Updated prompt",
            name="Updated Name"
        )

        self.assertEqual(updated.content, "Updated prompt")
        self.assertEqual(updated.name, "Updated Name")

        # Check that the update was persisted
        retrieved = self.prompt_manager.get(prompt.id)
        self.assertEqual(retrieved.content, "Updated prompt")
        self.assertEqual(retrieved.name, "Updated Name")

    def test_delete_prompt(self):
        """Test deleting a prompt."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        success = self.prompt_manager.delete(prompt.id)

        self.assertTrue(success)
        self.assertIsNone(self.prompt_manager.get(prompt.id))

    def test_list_prompts(self):
        """Test listing prompts."""
        self.prompt_manager.create(
            content="Test prompt 1",
            name="Test Prompt 1",
            tags=["test", "one"]
        )

        self.prompt_manager.create(
            content="Test prompt 2",
            name="Test Prompt 2",
            tags=["test", "two"]
        )

        all_prompts = self.prompt_manager.list()
        self.assertEqual(len(all_prompts), 2)

        test_tag_prompts = self.prompt_manager.list(tags=["test"])
        self.assertEqual(len(test_tag_prompts), 2)

        one_tag_prompts = self.prompt_manager.list(tags=["one"])
        self.assertEqual(len(one_tag_prompts), 1)
        self.assertEqual(one_tag_prompts[0].name, "Test Prompt 1")

    def test_render_prompt(self):
        """Test rendering a prompt with variables."""
        prompt = self.prompt_manager.create(
            content="Hello, {name}! You are a {occupation}.",
            name="Test Prompt"
        )

        rendered = prompt.render(name="Alice", occupation="Data Scientist")

        self.assertEqual(rendered, "Hello, Alice! You are a Data Scientist.")

if __name__ == "__main__":
    unittest.main()
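Since the suite is plain unittest, standard test discovery should pick this file up. A minimal sketch, assuming it is run from the repository root; it is equivalent to invoking unittest discovery from the command line.

    # Hypothetical runner sketch for the tests above.
    import unittest

    suite = unittest.defaultTestLoader.discover("promptlab/tests")
    unittest.TextTestRunner(verbosity=2).run(suite)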
promptlab/tests/test_testing.py
ADDED
File without changes
promptlab/tests/test_version_control.py
ADDED
File without changes
promptlab/utils/__init__.py
ADDED
File without changes
promptlab/utils/metrics.py
ADDED
@@ -0,0 +1,161 @@
from typing import Dict, List, Optional, Any, Union, Callable
import re
import numpy as np
from difflib import SequenceMatcher

def exact_match(generated: str, expected: str) -> float:
    """Calculate exact match score (1.0 if exact match, 0.0 otherwise)."""
    if not expected or not generated:
        return 0.0
    return 1.0 if generated.strip() == expected.strip() else 0.0

def contains_all(generated: str, items: List[str], case_sensitive: bool = False) -> float:
    """Check if generated text contains all items in the list."""
    if not items:
        return 0.0

    if not case_sensitive:
        generated = generated.lower()
        items = [item.lower() for item in items]

    matches = sum(1 for item in items if item in generated)
    return matches / len(items)

def similarity_score(str1: str, str2: str) -> float:
    """Calculate string similarity using difflib."""
    if not str1 or not str2:
        return 0.0
    return SequenceMatcher(None, str1, str2).ratio()

def word_count(text: str) -> int:
    """Count words in text."""
    return len(re.findall(r'\w+', text))

def length_ratio(generated: str, expected: str) -> float:
    """Calculate ratio of generated text length to expected text length."""
    if not expected:
        return 0.0

    gen_length = len(generated)
    exp_length = len(expected)

    # Avoid division by zero
    if exp_length == 0:
        return 0.0 if gen_length > 0 else 1.0

    # Return value between 0 and 1, with 1 being perfect match
    # and decreasing as the ratio diverges from 1
    ratio = gen_length / exp_length
    return min(ratio, 1/ratio) if ratio > 0 else 0.0

def word_overlap(generated: str, expected: str) -> float:
    """Calculate the word overlap between generated and expected text."""
    if not expected or not generated:
        return 0.0

    gen_words = set(re.findall(r'\w+', generated.lower()))
    exp_words = set(re.findall(r'\w+', expected.lower()))

    if not exp_words:
        return 0.0

    intersection = gen_words.intersection(exp_words)
    return len(intersection) / len(exp_words)

def keyword_presence(text: str, keywords: List[str], weight: Optional[Dict[str, float]] = None) -> Dict[str, float]:
    """Check for presence of keywords with optional weights."""
    if not keywords:
        return {"keyword_score": 0.0}

    text = text.lower()
    result = {}

    total_weight = 0
    weighted_score = 0

    for keyword in keywords:
        keyword_lower = keyword.lower()
        presence = 1.0 if keyword_lower in text else 0.0

        # Apply weight if provided
        kw_weight = weight.get(keyword, 1.0) if weight else 1.0
        total_weight += kw_weight
        weighted_score += presence * kw_weight

        result[f"keyword_{keyword}"] = presence

    # Calculate overall weighted score
    if total_weight > 0:
        result["keyword_score"] = weighted_score / total_weight
    else:
        result["keyword_score"] = 0.0

    return result

class MetricsSet:
    """A collection of evaluation metrics functions."""
    def __init__(self):
        self.metrics = {}

    def add_metric(self, name: str, func: Callable, description: Optional[str] = None) -> None:
        """Add a metric function to the set."""
        self.metrics[name] = {
            "function": func,
            "description": description or ""
        }

    def evaluate(self, generated: str, expected: Optional[str] = None, **kwargs) -> Dict[str, float]:
        """Evaluate all metrics on the given text."""
        results = {}

        for name, metric in self.metrics.items():
            try:
                # Different metrics may require different arguments
                if expected is not None:
                    if "keywords" in kwargs and name == "keyword_presence":
                        result = metric["function"](generated, kwargs["keywords"])
                    else:
                        result = metric["function"](generated, expected)
                else:
                    result = metric["function"](generated)

                # Handle both single values and dictionaries
                if isinstance(result, dict):
                    results.update(result)
                else:
                    results[name] = result
            except Exception as e:
                results[name] = 0.0
                print(f"Error calculating metric {name}: {e}")

        return results

def create_default_metrics_set() -> MetricsSet:
    """Create a MetricsSet with default metrics."""
    metrics = MetricsSet()

    metrics.add_metric(
        "exact_match",
        exact_match,
        "Exact string match between expected and generated"
    )

    metrics.add_metric(
        "similarity",
        similarity_score,
        "String similarity using difflib's SequenceMatcher"
    )

    metrics.add_metric(
        "word_overlap",
        word_overlap,
        "Ratio of words in expected that appear in generated"
    )

    metrics.add_metric(
        "length_ratio",
        length_ratio,
        "Ratio of generated text length to expected text length"
    )

    return metrics
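A small sketch of the default metric set in use; the generated and reference sentences below are arbitrary examples.

    # Hypothetical usage sketch for create_default_metrics_set().
    from promptlab.utils.metrics import create_default_metrics_set

    metrics = create_default_metrics_set()
    scores = metrics.evaluate(
        generated="Paris is the capital of France.",
        expected="The capital of France is Paris.",
    )
    print(scores)  # exact_match, similarity, word_overlap, length_ratio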
promptlab/utils/storage.py
ADDED
@@ -0,0 +1,79 @@
import os
import json
import shutil
from typing import Dict, Any, Optional, List

class Storage:
    """Handles persistent storage for PromptLab."""
    def __init__(self, base_path: str):
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def ensure_dir(self, dir_path: str) -> str:
        """Ensure directory exists and return its path."""
        full_path = os.path.join(self.base_path, dir_path)
        os.makedirs(full_path, exist_ok=True)
        return full_path

    def save_json(self, dir_path: str, filename: str, data: Dict[str, Any]) -> str:
        """Save data to a JSON file."""
        dir_full_path = self.ensure_dir(dir_path)
        file_path = os.path.join(dir_full_path, f"{filename}.json")

        with open(file_path, "w") as f:
            json.dump(data, f, indent=2)

        return file_path

    def load_json(self, dir_path: str, filename: str) -> Optional[Dict[str, Any]]:
        """Load data from a JSON file."""
        file_path = os.path.join(self.base_path, dir_path, f"{filename}.json")

        if not os.path.exists(file_path):
            return None

        with open(file_path, "r") as f:
            return json.load(f)

    def list_files(self, dir_path: str, extension: Optional[str] = None) -> List[str]:
        """List files in a directory, optionally filtered by extension."""
        full_path = os.path.join(self.base_path, dir_path)

        if not os.path.exists(full_path):
            return []

        files = os.listdir(full_path)

        if extension:
            return [f for f in files if f.endswith(extension)]

        return files

    def delete_file(self, dir_path: str, filename: str) -> bool:
        """Delete a file."""
        file_path = os.path.join(self.base_path, dir_path, filename)

        if os.path.exists(file_path):
            os.remove(file_path)
            return True

        return False

    def backup(self, backup_path: Optional[str] = None) -> str:
        """Create a backup of the entire storage."""
        if not backup_path:
            backup_path = f"{self.base_path}_backup"

        shutil.make_archive(backup_path, "zip", self.base_path)
        return f"{backup_path}.zip"

    def restore(self, backup_path: str) -> bool:
        """Restore from a backup archive."""
        if not os.path.exists(backup_path):
            return False

        shutil.rmtree(self.base_path, ignore_errors=True)
        os.makedirs(self.base_path, exist_ok=True)

        shutil.unpack_archive(backup_path, self.base_path)
        return True
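A brief sketch of the storage round trip above; the base path, directory, and record contents are placeholders.

    # Hypothetical usage sketch: save, load, then archive and restore the store.
    from promptlab.utils.storage import Storage

    store = Storage("./promptlab_storage")
    store.save_json("prompts", "example", {"name": "Example", "content": "Hi {name}"})
    print(store.load_json("prompts", "example"))

    archive = store.backup()   # writes <base_path>_backup.zip and returns its path
    store.restore(archive)     # wipes the base path and re-extracts the archive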
promptlab/utils/templating.py
ADDED
@@ -0,0 +1,259 @@
import re
import json
from typing import Dict, Any, List, Optional, Union, Callable
from string import Formatter


class TemplateError(Exception):
    """Exception raised for errors in template rendering."""
    pass


class PromptTemplate:
    """Advanced templating system for prompts."""

    def __init__(self, template: str):
        self.template = template
        self._validate_template()

    def _validate_template(self) -> None:
        """Validate template syntax."""
        try:
            # Check for basic placeholder syntax
            list(Formatter().parse(self.template))

            # Check for conditional syntax
            self._validate_conditionals()

            # Check for loop syntax
            self._validate_loops()
        except Exception as e:
            raise TemplateError(f"Invalid template syntax: {str(e)}")

    def _validate_conditionals(self) -> None:
        """Validate conditional blocks in the template."""
        # Simple validation to ensure if/endif blocks match
        if_count = len(re.findall(r'\{\s*if\s+.*?\s*\}', self.template))
        endif_count = len(re.findall(r'\{\s*endif\s*\}', self.template))

        if if_count != endif_count:
            raise TemplateError(f"Mismatched conditional blocks: {if_count} 'if' and {endif_count} 'endif'")

    def _validate_loops(self) -> None:
        """Validate loop blocks in the template."""
        # Simple validation to ensure for/endfor blocks match
        for_count = len(re.findall(r'\{\s*for\s+.*?\s*\}', self.template))
        endfor_count = len(re.findall(r'\{\s*endfor\s*\}', self.template))

        if for_count != endfor_count:
            raise TemplateError(f"Mismatched loop blocks: {for_count} 'for' and {endfor_count} 'endfor'")

    def _render_conditionals(self, template: str, variables: Dict[str, Any]) -> str:
        """Process conditional blocks in the template."""
        # Handle if-else-endif blocks
        pattern = r'\{\s*if\s+(.*?)\s*\}(.*?)(?:\{\s*else\s*\}(.*?))?\{\s*endif\s*\}'

        def replace_conditional(match):
            condition = match.group(1)
            if_block = match.group(2)
            else_block = match.group(3) or ""

            # Evaluate condition
            try:
                # Replace variables in condition
                for var_name, var_value in variables.items():
                    if isinstance(var_value, str):
                        # For strings, replace with quoted value
                        condition = condition.replace(var_name, f'"{var_value}"')
                    else:
                        # For other types, replace directly
                        condition = condition.replace(var_name, str(var_value))

                result = eval(condition, {"__builtins__": {}}, variables)
                return if_block if result else else_block
            except Exception as e:
                raise TemplateError(f"Error evaluating condition '{condition}': {str(e)}")

        # Use re.DOTALL to match across multiple lines
        return re.sub(pattern, replace_conditional, template, flags=re.DOTALL)

    def _render_loops(self, template: str, variables: Dict[str, Any]) -> str:
        """Process loop blocks in the template."""
        # Handle for loops
        pattern = r'\{\s*for\s+(.*?)\s+in\s+(.*?)\s*\}(.*?)\{\s*endfor\s*\}'

        def replace_loop(match):
            var_name = match.group(1)
            iterable_expr = match.group(2)
            loop_body = match.group(3)

            try:
                # Get the iterable from variables
                if iterable_expr in variables and hasattr(variables[iterable_expr], '__iter__'):
                    iterable = variables[iterable_expr]
                else:
                    # Try to evaluate the expression
                    iterable = eval(iterable_expr, {"__builtins__": {}}, variables)

                if not hasattr(iterable, '__iter__'):
                    raise TemplateError(f"'{iterable_expr}' is not iterable")

                # Process the loop body for each item
                result = []
                for item in iterable:
                    # Create a copy of variables with loop variable
                    loop_vars = variables.copy()
                    loop_vars[var_name] = item

                    # Process the loop body with the new variables
                    body_content = loop_body
                    for k, v in loop_vars.items():
                        placeholder = f"{{{k}}}"
                        if placeholder in body_content:
                            body_content = body_content.replace(placeholder, str(v))

                    result.append(body_content)

                return "".join(result)
            except Exception as e:
                raise TemplateError(f"Error processing loop '{match.group(0)}': {str(e)}")

        # Use re.DOTALL to match across multiple lines
        return re.sub(pattern, replace_loop, template, flags=re.DOTALL)

    def _apply_filters(self, value: Any, filters: List[str]) -> str:
        """Apply filters to a value."""
        result = value
        for filter_name in filters:
            if filter_name == "upper":
                result = str(result).upper()
            elif filter_name == "lower":
                result = str(result).lower()
            elif filter_name == "title":
                result = str(result).title()
            elif filter_name == "capitalize":
                result = str(result).capitalize()
            elif filter_name == "strip":
                result = str(result).strip()
            elif filter_name == "json":
                result = json.dumps(result)
            else:
                raise TemplateError(f"Unknown filter: {filter_name}")
        return result

    def _render_variables(self, template: str, variables: Dict[str, Any]) -> str:
        """Replace variables in the template with their values."""
        result = template

        # Process variables with filters
        pattern = r'\{(.*?)(?:\|(.*?))?\}'

        def replace_var(match):
            var_expr = match.group(1).strip()
            filters_expr = match.group(2)

            # Extract filters
            filters = []
            if filters_expr:
                filters = [f.strip() for f in filters_expr.split('|')]

            try:
                # Simple variable
                if var_expr in variables:
                    value = variables[var_expr]
                else:
                    # Try to evaluate as an expression
                    try:
                        value = eval(var_expr, {"__builtins__": {}}, variables)
                    except Exception:
                        return match.group(0)  # Keep as is if evaluation fails

                # Apply filters
                return str(self._apply_filters(value, filters))
            except Exception as e:
                raise TemplateError(f"Error processing variable '{var_expr}': {str(e)}")

        return re.sub(pattern, replace_var, result)

    def render(self, **kwargs) -> str:
        """Render the template with provided variables."""
        result = self.template

        # Process templates in multiple passes
        # First, handle conditional blocks
        result = self._render_conditionals(result, kwargs)

        # Then, handle loops
        result = self._render_loops(result, kwargs)

        # Finally, handle simple variable substitution
        result = self._render_variables(result, kwargs)

        return result


class PromptTemplateRegistry:
    """Registry for prompt templates."""

    def __init__(self):
        self.templates: Dict[str, PromptTemplate] = {}

    def register(self, name: str, template: Union[str, PromptTemplate]) -> None:
        """Register a template."""
        if isinstance(template, str):
            template = PromptTemplate(template)
        self.templates[name] = template

    def get(self, name: str) -> Optional[PromptTemplate]:
        """Get a template by name."""
        return self.templates.get(name)

    def render(self, name: str, **kwargs) -> str:
        """Render a template by name."""
        template = self.get(name)
        if not template:
            raise ValueError(f"Template '{name}' not found")
        return template.render(**kwargs)

    def list_templates(self) -> List[str]:
        """List all registered templates."""
        return list(self.templates.keys())


# Create a singleton instance
template_registry = PromptTemplateRegistry()

# Register some common templates
template_registry.register(
    "basic_completion",
    """
{system_message}

{user_message}
"""
)

template_registry.register(
    "chat_template",
    """
{system_message}

{for message in conversation}
{if message.role == "user"}Human: {message.content}
{else}Assistant: {message.content}
{endif}
{endfor}
"""
)

template_registry.register(
    "few_shot",
    """
{system_message}

Here are some examples:
{for example in examples}
Input: {example.input}
Output: {example.output}
{endfor}

Input: {input}
Output:
"""
)
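A quick, hedged illustration of the templating API defined above — a minimal sketch, not part of the committed file; it assumes the package is installed so the module is importable as promptlab.utils.templating, and the names and values in it are made up for illustration:

# Minimal usage sketch of promptlab/utils/templating.py (illustrative only).
from promptlab.utils.templating import PromptTemplate, template_registry

# Placeholder substitution combined with the built-in "title" filter.
greeting = PromptTemplate("Hello, {name|title}!")
print(greeting.render(name="ada lovelace"))  # Hello, Ada Lovelace!

# A conditional block keyed on a boolean variable.
tone = PromptTemplate(
    "{if verbose}Explain your reasoning step by step.{else}Answer briefly.{endif}"
)
print(tone.render(verbose=True))  # Explain your reasoning step by step.

# Rendering one of the pre-registered templates by name.
prompt = template_registry.render(
    "basic_completion",
    system_message="You are a helpful assistant.",
    user_message="Summarise the following report.",
)
print(prompt)

The registry lookup in the last step works because basic_completion is one of the templates registered at module import time by the code above.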
pyproject.toml
ADDED
@@ -0,0 +1,45 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "promptlab"
version = "0.1.0"
description = "A comprehensive LLM Prompt Management System"
readme = "README.md"
requires-python = ">=3.7"
license = {text = "MIT"}
keywords = ["llm", "prompt engineering", "nlp", "machine learning"]
authors = [
    {name = "Biswanath Roul"}
]
maintainers = [
    {name = "Biswanath Roul"}
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "numpy>=1.20.0",
]

[project.urls]
"Homepage" = "https://github.com/biswanathroul/promptlab"
"Bug Tracker" = "https://github.com/biswanathroul/promptlab/issues"
"Documentation" = "https://github.com/biswanathroul/promptlab/wiki"
"Source Code" = "https://github.com/biswanathroul/promptlab"

[project.scripts]
promptlab = "promptlab.cli.commands:main"

[tool.setuptools]
packages = ["promptlab", "promptlab.core", "promptlab.cli", "promptlab.utils"]
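A usage note on this packaging config, offered as an assumption rather than something the commit states: the [project.scripts] entry maps a promptlab console command to promptlab.cli.commands:main, so installing the project (for example with an editable pip install -e . from the repository root) should place a promptlab executable on the PATH, and the [tool.setuptools] packages list is what makes promptlab.utils.templating importable as in the sketch above.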