Spaces:

whackthejacker
/

DataHubHub

Paused

App Files Files Community

whackthejacker commited on Mar 2

Commit

43b66f1

verified ·

1 Parent(s): fbdb6f7

Upload 34 files

Browse files

Files changed (34) hide show

.env +1 -0
.gitattributes +1 -0
.github/workflows/action.yml +41 -0
.github/workflows/huggingface-workflow.yml +48 -0
.gitignore +13 -0
.replit +43 -0
.streamlit/config.toml +9 -0
.streamlit/secrets.toml +7 -0
README.md +87 -2
app.py +11 -0
assets/custom.css +59 -0
attached_assets/Pasted-Below-is-a-design-proposal-for-a-Hugging-Face-based-system-that-lets-users-fine-tune-a-code-generati-1740904225626.txt +116 -0
attached_assets/Pasted-For-a-robust-foundation-you-ll-want-to-configure-a-set-of-tools-that-catch-errors-early-maintain-c-1740904212802.txt +19 -0
attached_assets/Pasted-For-a-robust-foundation-you-ll-want-to-configure-a-set-of-tools-that-catch-errors-early-maintain-c-1740906031222.txt +19 -0
components/code_quality.py +347 -0
components/dataset_preview.py +75 -0
components/dataset_statistics.py +149 -0
components/dataset_uploader.py +113 -0
components/dataset_validation.py +181 -0
components/dataset_version_control.py +276 -0
components/dataset_visualization.py +502 -0
components/fine_tuning/__init__.py +3 -0
components/fine_tuning/finetune_ui.py +529 -0
components/fine_tuning/model_interface.py +228 -0
generated-icon.png +3 -0
huggingface-spacefile +8 -0
main.py +546 -0
pyproject.toml +33 -0
replit.nix +18 -0
test_app.py +105 -0
utils/dataset_utils.py +167 -0
utils/huggingface_integration.py +99 -0
utils/smolagents_integration.py +211 -0
uv.lock +0 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ PYTHONPATH=.

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+generated-icon.png filter=lfs diff=lfs merge=lfs -text

.github/workflows/action.yml ADDED Viewed

	@@ -0,0 +1,41 @@

+# Composite action: installs huggingface-hub, stores the supplied token where
+# the Hugging Face tooling looks for it, and optionally registers the
+# credentials with git.
+# NOTE(review): this is action metadata (`inputs:` / `runs:`), not a workflow
+# (`on:` / `jobs:`). Files under .github/workflows/ are parsed as workflows, so
+# it probably belongs in its own action directory - confirm.
+name: Huggingface Login
+description: "Login to Huggingface using token"
+author: osbm
+branding:
+  icon: server
+  color: yellow
+inputs:
+  username:
+    description: "Huggingface Username"
+    required: true
+  key:
+    description: "Huggingface token"
+    required: true
+  add_to_git_credentials:
+    description: "Add to git credentials"
+    required: false
+    default: "false"
+runs:
+  using: "composite"
+  steps:
+    - name: Install huggingface-hub
+      shell: bash
+      run: |
+        pip install huggingface-hub
+    - name: Login to Huggingface
+      shell: bash
+      # Inputs reach the script through the environment rather than being
+      # expanded into the script text, so a value containing quotes or $(...)
+      # cannot inject shell commands.
+      env:
+        HF_TOKEN: ${{ inputs.key }}
+      run: |
+        mkdir -p ~/.cache/huggingface
+        printf '%s\n' "$HF_TOKEN" > ~/.cache/huggingface/token
+        chmod 600 ~/.cache/huggingface/token
+    - name: Add to git credentials
+      shell: bash
+      if: inputs.add_to_git_credentials == 'true'
+      env:
+        HF_USERNAME: ${{ inputs.username }}
+        HF_TOKEN: ${{ inputs.key }}
+      run: |
+        git config --global credential.helper store
+        git config --global credential.https://huggingface.co.username "$HF_USERNAME"
+        # git has no credential.<url>.password setting; the secret has to be
+        # handed to the configured helper, which `git credential approve` does.
+        printf 'protocol=https\nhost=huggingface.co\nusername=%s\npassword=%s\n\n' "$HF_USERNAME" "$HF_TOKEN" | git credential approve

.github/workflows/huggingface-workflow.yml ADDED Viewed

	@@ -0,0 +1,48 @@

+# Workflow: on every push to main, log in to Hugging Face with the HF_TOKEN
+# repository secret and send one example request to a Space endpoint.
+name: Hugging Face Space Interaction
+on:
+  push:
+    branches:
+      - main
+jobs:
+  interact-with-space:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install huggingface_hub requests
+      - name: Login to Hugging Face
+        # --token takes the token as its argument; it is not read from stdin.
+        run: huggingface-cli login --token "$HF_TOKEN"
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      - name: Example interaction with Space
+        # The script exits non-zero on a non-200 response so a failed call
+        # fails the job instead of being reported as a success.
+        run: |
+          python -c "
+          import os
+          import sys
+          import requests
+          HF_TOKEN = os.environ.get('HF_TOKEN')
+          headers = {'Authorization': f'Bearer {HF_TOKEN}'}
+          API_URL = 'YOUR_SPACE_API_URL' # Replace with your Space's API URL.
+          payload = {'inputs': 'Your input data'}
+          response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+          if response.status_code == 200:
+              print(response.json())
+          else:
+              print(f'Error: {response.status_code}, {response.text}')
+              sys.exit(1)
+          "
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}

.gitignore ADDED Viewed

	@@ -0,0 +1,13 @@

+__pycache__/
+*.py[cod]
+*$py.class
+.env
+.venv
+env/
+venv/
+ENV/
+database/data/*.db
+fine_tuned_models/
+.streamlit/secrets.toml
+.ipynb_checkpoints/
+.DS_Store

.replit ADDED Viewed

	@@ -0,0 +1,43 @@

+modules = ["python-3.11"]
+[nix]
+channel = "stable-24_05"
+[deployment]
+deploymentTarget = "autoscale"
+run = ["sh", "-c", "python -m streamlit run DataHubHub/app.py --server.address=0.0.0.0 --server.port=5000 --server.headless=true --server.enableCORS=false --server.enableXsrfProtection=false"]
+[workflows]
+runButton = "Project"
+[[workflows.workflow]]
+name = "Project"
+mode = "parallel"
+author = "agent"
+[[workflows.workflow.tasks]]
+task = "workflow.run"
+args = "Streamlit Server"
+[[workflows.workflow]]
+name = "Streamlit Server"
+author = "agent"
+[workflows.workflow.metadata]
+agentRequireRestartOnSave = false
+[[workflows.workflow.tasks]]
+task = "packager.installForAll"
+[[workflows.workflow.tasks]]
+task = "shell.exec"
+args = "python -m streamlit run DataHubHub/app.py --server.address=0.0.0.0 --server.port=5000 --server.headless=true --server.enableCORS=false --server.enableXsrfProtection=false"
+waitForPort = 5000
+[[ports]]
+localPort = 5000
+externalPort = 5000
+[[ports]]
+localPort = 8501
+externalPort = 80

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,9 @@

+# Streamlit server settings for running the app without a local browser.
+[server]
+headless = true
+# Listen on every network interface.
+address = "0.0.0.0"
+# Same port the .replit launch command passes via --server.port.
+port = 5000
+# NOTE(review): the .replit launch command passes --server.enableCORS=false,
+# which disagrees with the value here - confirm which one is intended.
+enableCORS = true
+# NOTE(review): XSRF protection is switched off; confirm this is deliberate
+# for the hosting environment.
+enableXsrfProtection = false
+[browser]
+gatherUsageStats = false

.streamlit/secrets.toml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Secrets configuration for Hugging Face
+# This file is for demonstration purposes only
+# Replace with actual API keys if needed
+[huggingface]
+# Add your Hugging Face API token here if needed
+# hf_token = "YOUR_HF_TOKEN"

README.md CHANGED Viewed

@@ -1,13 +1,98 @@
 ---
 title: DataHubHub
 emoji: ⚡
-colorFrom: gray
 colorTo: indigo
 sdk: streamlit
 sdk_version: 1.42.2
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: DataHubHub
 emoji: ⚡
+colorFrom: red
 colorTo: indigo
 sdk: streamlit
 sdk_version: 1.42.2
 app_file: app.py
 pinned: false
 license: apache-2.0
+language: en
 ---
+# ML Dataset & Code Generation Manager
+A comprehensive platform for ML dataset management and code generation with Hugging Face integration.
+## Features
+- **Dataset Management**: Upload, explore, and manage machine learning datasets
+- **Data Visualization**: Visualize dataset statistics and distributions
+- **Code Generation**: Fine-tune models for code generation tasks
+- **Code Quality Tools**: Improve code quality with integrated formatters, linters, and type checkers
+## Technology Stack
+- **Frontend**: Streamlit
+- **Backend**: Python
+- **Database**: SQLite (via SQLAlchemy)
+- **ML Integration**: Hugging Face Transformers, Datasets
+- **Visualization**: Plotly, Matplotlib
+## Project Structure
+```
+.
+├── app.py                     # Main application entry point
+├── components/                # UI components
+│   ├── code_quality.py        # Code quality tools
+│   ├── dataset_preview.py     # Dataset preview component
+│   ├── dataset_statistics.py  # Dataset statistics component
+│   ├── dataset_uploader.py    # Dataset upload component
+│   ├── dataset_validation.py  # Dataset validation component
+│   ├── dataset_visualization.py # Dataset visualization component
+│   └── fine_tuning/           # Fine-tuning components
+│       ├── finetune_ui.py     # Fine-tuning UI
+│       └── model_interface.py # Model interface
+├── database/                  # Database configuration
+│   ├── models.py              # Database models
+│   └── operations.py          # Database operations
+├── utils/                     # Utility functions
+│   ├── dataset_utils.py       # Dataset utilities
+│   ├── huggingface_integration.py # Hugging Face integration
+│   └── smolagents_integration.py # smolagents integration
+└── assets/                    # Static assets
+```
+## Deployment
+This application is designed to be deployed as a Hugging Face Space.
+### Hugging Face Space Deployment
+1. Fork this repository
+2. Create a new Hugging Face Space
+3. Connect the forked repository to your Space
+4. The application will be deployed automatically
+### Local Development
+1. Clone the repository
+2. Install dependencies:
+   ```
+   pip install streamlit pandas numpy plotly matplotlib scikit-learn SQLAlchemy huggingface-hub datasets transformers torch
+   ```
+3. Run the application:
+   ```
+   streamlit run app.py
+   ```
+## Configuration
+- `.streamlit/config.toml`: Streamlit configuration
+- `.streamlit/secrets.toml`: Secrets and API keys
+- `huggingface-spacefile`: Hugging Face Space configuration
+## API Keys
+To use the Hugging Face integration features, add your Hugging Face API token to `.streamlit/secrets.toml`:
+```toml
+[huggingface]
+hf_token = "YOUR_HF_TOKEN"
+```
+## License
+This project is licensed under the Apache License 2.0 (the `license: apache-2.0` value declared in the Space metadata at the top of this file) - see the LICENSE file for details.

app.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""
+ML Dataset & Code Generation Manager - Streamlit Application
+This is the main entry point for the Streamlit application.
+It contains no logic of its own: it imports ``main`` from ``main.py`` and
+calls it when the file is executed as a script.
+"""
+# The application itself lives in main.py; only its main() entry point is needed here.
+from main import main
+# Call main() only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

assets/custom.css ADDED Viewed

	@@ -0,0 +1,59 @@

+            /* Custom styles for ML Dataset & Code Generation Manager */
+            @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Space+Grotesk:wght@500;700&display=swap');
+            h1, h2, h3, h4, h5, h6 {
+                font-family: 'Space Grotesk', sans-serif;
+                font-weight: 700;
+                color: #1A1C1F;
+            }
+            body {
+                font-family: 'Inter', sans-serif;
+                color: #1A1C1F;
+                background-color: #F8F9FA;
+            }
+            .stButton button {
+                background-color: #2563EB;
+                color: white;
+                border-radius: 4px;
+                border: none;
+                padding: 0.5rem 1rem;
+                font-weight: 600;
+            }
+            .stButton button:hover {
+                background-color: #1D4ED8;
+            }
+            /* Card styling */
+            .card {
+                background-color: white;
+                border-radius: 8px;
+                padding: 1.5rem;
+                box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+                margin-bottom: 1rem;
+            }
+            /* Accent colors */
+            .accent-primary {
+                color: #2563EB;
+            }
+            .accent-secondary {
+                color: #84919A;
+            }
+            .accent-success {
+                color: #10B981;
+            }
+            .accent-warning {
+                color: #F59E0B;
+            }
+            .accent-danger {
+                color: #EF4444;
+            }

attached_assets/Pasted-Below-is-a-design-proposal-for-a-Hugging-Face-based-system-that-lets-users-fine-tune-a-code-generati-1740904225626.txt ADDED Viewed

	@@ -0,0 +1,116 @@

+Below is a design proposal for a Hugging Face–based system that lets users fine-tune a code generation model via a simple Streamlit interface.
+Overview:
+	1.	Model & Library Setup:
+      •   Use a pre-trained code generation model (e.g., CodeT5 or CodeT5-base) from Hugging Face.
+      •   Leverage the Hugging Face Transformers and Datasets libraries together with the Hugging Face Trainer API to perform fine-tuning.
+	2.	Streamlit Interface:
+      •   Input Section: Users can upload a small dataset (e.g., a CSV file with code and target comments) or manually enter a few fine-tuning examples.
+      •   Hyperparameter Controls: Sliders or input boxes for settings like learning rate, number of epochs, batch size, and maybe even a choice of optimizer.
+      •   Execution Controls: Buttons to start fine-tuning and to monitor training progress (using, for example, real-time logging or a progress bar).
+      •   Output Section: Display training metrics (loss curves, evaluation scores) and allow users to run inference on new prompts once fine-tuning completes.
+	3.	Back-end Process:
+      •   When the user initiates fine-tuning, the uploaded dataset is preprocessed (tokenization using the model’s tokenizer).
+      •   A Trainer object is configured with the user-specified hyperparameters.
+      •   Fine-tuning is launched (this can run in a background thread or via caching intermediate results).
+      •   Once training is complete, the updated model can be saved to disk (or even directly loaded into the interface for inference).
+	4.	Deployment & Reproducibility:
+      •   The whole pipeline (data upload, preprocessing, training, evaluation, and inference) should be reproducible.
+      •   Optionally, support saving the fine-tuned model and the training configuration to allow users to share their work.
+Example Code Snippet (Simplified):
+Below is a simplified version of what the Streamlit app might look like. (Note: In a production setup, you would want proper error handling and asynchronous processing.)
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
+from datasets import load_dataset, Dataset
+import torch
+# Title
+st.title("Fine-Tune Code Generation Model with Hugging Face & Streamlit")
+# Sidebar: Hyperparameters
+st.sidebar.header("Training Hyperparameters")
+learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 5e-5, 2e-5, 1e-6)
+epochs = st.sidebar.number_input("Epochs", 1, 10, 3)
+batch_size = st.sidebar.number_input("Batch Size", 4, 32, 8)
+# Upload your fine-tuning data: CSV file with columns "input" and "target"
+uploaded_file = st.file_uploader("Upload your fine-tuning dataset (CSV)", type="csv")
+if uploaded_file is not None:
+    import pandas as pd
+    df = pd.read_csv(uploaded_file)
+    st.write("Dataset preview:", df.head())
+    # Convert to Hugging Face Dataset
+    dataset = Dataset.from_pandas(df)
+else:
+    st.info("Please upload a CSV dataset with columns 'input' and 'target'.")
+# Model selection
+model_name = st.selectbox("Choose a model", ["Salesforce/codet5-base"])
+# Load model and tokenizer
+@st.cache_resource(show_spinner=False)
+def load_model_and_tokenizer(name):
+    tokenizer = AutoTokenizer.from_pretrained(name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(name)
+    return tokenizer, model
+tokenizer, model = load_model_and_tokenizer(model_name)
+# Preprocess function for tokenization
+def preprocess_function(examples):
+    inputs = [f"translate code to comment: {ex}" for ex in examples["input"]]
+    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
+    with tokenizer.as_target_tokenizer():
+        labels = tokenizer(examples["target"], max_length=64, truncation=True)
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+if uploaded_file is not None:
+    tokenized_dataset = dataset.map(preprocess_function, batched=True)
+    # Setup training arguments
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        learning_rate=learning_rate,
+        logging_steps=10,
+        logging_dir='./logs',
+        report_to="none"
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset,
+    )
+    if st.button("Start Fine-Tuning"):
+        st.info("Fine-tuning started... This might take a while.")
+        trainer.train()
+        st.success("Fine-tuning complete!")
+        # Save the model to disk (or load it for inference)
+        model.save_pretrained("fine_tuned_model")
+        tokenizer.save_pretrained("fine_tuned_model")
+        st.write("Model saved to 'fine_tuned_model'.")
+        # Option to run inference on new inputs
+        user_input = st.text_area("Enter a new code prompt for inference:")
+        if user_input:
+            inputs = tokenizer(f"translate code to comment: {user_input}", return_tensors="pt", truncation=True)
+            outputs = model.generate(**inputs, max_length=64)
+            generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            st.write("Generated comment:", generated_comment)
+Key Points:
+   •   User Interaction: The interface lets users set hyperparameters, upload datasets, and start fine-tuning.
+   •   Model Integration: It uses Hugging Face’s pre-trained CodeT5 model and tokenizer, then fine-tunes on user-provided examples.
+   •   Reproducibility: The pipeline includes caching, dataset conversion, and saving the final model.
+   •   Extensibility: You can later add more options (e.g., additional hyperparameters, evaluation metrics, visualization of training progress).
+This design should give you a robust, end-to-end solution to let users easily fine-tune a code generation model through a Streamlit interface. Would you like further details on any component of the design?

attached_assets/Pasted-For-a-robust-foundation-you-ll-want-to-configure-a-set-of-tools-that-catch-errors-early-maintain-c-1740904212802.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+For a robust foundation, you’ll want to configure a set of tools that catch errors early, maintain code quality, and ensure your model and interface work as intended. Here’s a comprehensive list:
+   •   Linting and Formatting:
+      •   Pylint or Flake8: Both are excellent for catching stylistic issues, potential errors, and enforcing coding standards.
+      •   Black: An uncompromising code formatter that automatically reformats your code to a consistent style.
+      •   isort: Automatically sorts your imports, keeping them tidy and making merge conflicts less likely.
+      •   mypy: For static type checking—great for catching type mismatches early, especially in larger projects.
+   •   Debugging:
+      •   pdb or ipdb: Python’s built-in debugger (with ipdb providing a friendlier interface) lets you step through code interactively.
+      •   VS Code Debugger: If you’re using VS Code, take advantage of its powerful debugging features with breakpoints, variable inspection, and integrated terminal support.
+      •   Streamlit’s Debugging Tools: Streamlit now offers logging and error traceback views—integrate these for your interface to catch issues on the fly.
+   •   Testing:
+      •   pytest: A flexible testing framework that supports fixtures and parameterized tests. It’s widely used for both unit and integration tests.
+      •   unittest: Python’s built-in framework for basic tests (though pytest often provides a more modern and user-friendly approach).
+      •   coverage.py: To measure how much of your code is exercised by your tests, ensuring thorough test coverage.
+      •   Tox: For running your tests in multiple environments, which is useful if your project depends on various Python versions or dependencies.
+   •   Continuous Integration (CI):
+      •   GitHub Actions or GitLab CI: Automate your linting, testing, and building processes so that every commit triggers your checks—keeping your repository healthy over time.
+Setting these up at the beginning ensures that your code stays clean, errors are caught early, and your automated code generation pipeline is both reliable and production-ready. This not only speeds up development but also builds a solid foundation for scaling and collaboration.

attached_assets/Pasted-For-a-robust-foundation-you-ll-want-to-configure-a-set-of-tools-that-catch-errors-early-maintain-c-1740906031222.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+For a robust foundation, you’ll want to configure a set of tools that catch errors early, maintain code quality, and ensure your model and interface work as intended. Here’s a comprehensive list:
+   •   Linting and Formatting:
+      •   Pylint or Flake8: Both are excellent for catching stylistic issues, potential errors, and enforcing coding standards.
+      •   Black: An uncompromising code formatter that automatically reformats your code to a consistent style.
+      •   isort: Automatically sorts your imports, keeping them tidy and making merge conflicts less likely.
+      •   mypy: For static type checking—great for catching type mismatches early, especially in larger projects.
+   •   Debugging:
+      •   pdb or ipdb: Python’s built-in debugger (with ipdb providing a friendlier interface) lets you step through code interactively.
+      •   VS Code Debugger: If you’re using VS Code, take advantage of its powerful debugging features with breakpoints, variable inspection, and integrated terminal support.
+      •   Streamlit’s Debugging Tools: Streamlit now offers logging and error traceback views—integrate these for your interface to catch issues on the fly.
+   •   Testing:
+      •   pytest: A flexible testing framework that supports fixtures and parameterized tests. It’s widely used for both unit and integration tests.
+      •   unittest: Python’s built-in framework for basic tests (though pytest often provides a more modern and user-friendly approach).
+      •   coverage.py: To measure how much of your code is exercised by your tests, ensuring thorough test coverage.
+      •   Tox: For running your tests in multiple environments, which is useful if your project depends on various Python versions or dependencies.
+   •   Continuous Integration (CI):
+      •   GitHub Actions or GitLab CI: Automate your linting, testing, and building processes so that every commit triggers your checks—keeping your repository healthy over time.
+Setting these up at the beginning ensures that your code stays clean, errors are caught early, and your automated code generation pipeline is both reliable and production-ready. This not only speeds up development but also builds a solid foundation for scaling and collaboration.

components/code_quality.py ADDED Viewed

	@@ -0,0 +1,347 @@

+"""
+Code quality tools and configuration for the application.
+"""
+import streamlit as st
+import subprocess
+import os
+from pathlib import Path
+import tempfile
+import json
+def render_code_quality_tools():
+    """
+    Draw the "Code Quality Tools" page: a heading followed by one tab per tool
+    category, each filled in by its dedicated render function.
+    """
+    st.markdown("<h2>Code Quality Tools</h2>", unsafe_allow_html=True)
+    # (tab label, renderer) pairs in the order the tabs appear.
+    sections = (
+        ("Linting", render_linting_tools),
+        ("Formatting", render_formatting_tools),
+        ("Type Checking", render_type_checking_tools),
+        ("Testing", render_testing_tools),
+    )
+    tab_containers = st.tabs([label for label, _ in sections])
+    for container, (_, render_section) in zip(tab_containers, sections):
+        with container:
+            render_section()
+def render_linting_tools():
+    """
+    Draw the linting panel: the user uploads a Python file, picks Pylint or
+    Flake8, and the chosen linter's report is shown.
+    """
+    st.markdown("### Linting with Pylint/Flake8")
+    st.markdown("""
+    Linting tools help identify potential errors, enforce coding standards, and encourage best practices.
+    **Available Tools:**
+    - **Pylint**: Comprehensive linter that checks for errors and enforces a coding standard
+    - **Flake8**: Wrapper around PyFlakes, pycodestyle, and McCabe complexity checker
+    """)
+    source_file = st.file_uploader("Upload Python file for linting", type=["py"])
+    linter = st.radio("Select linter", ["Pylint", "Flake8"])
+    # The button is only drawn once a file has been supplied (short-circuit).
+    if not (source_file and st.button("Run Linter")):
+        return
+    with st.spinner(f"Running {linter}..."):
+        # The linters work on paths, so the upload is spooled to a temp file
+        # that outlives the `with` block and is removed in `finally`.
+        with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as handle:
+            handle.write(source_file.getvalue())
+            script_path = handle.name
+        executable = "pylint" if linter == "Pylint" else "flake8"
+        try:
+            outcome = subprocess.run(
+                [executable, script_path],
+                capture_output=True,
+                text=True
+            )
+            st.subheader("Linting Results")
+            if outcome.returncode != 0:
+                st.error("Issues found:")
+                st.code(outcome.stdout or outcome.stderr, language="text")
+            else:
+                st.success("No issues found!")
+        except Exception as exc:
+            st.error(f"Error running {linter}: {str(exc)}")
+        finally:
+            os.unlink(script_path)
+def render_formatting_tools():
+    """
+    Draw the formatting panel: the user uploads a Python file, picks Black,
+    isort or both, and the original and formatted sources are shown side by side.
+
+    If a formatter exits with a non-zero status its output is shown as an error
+    and no "formatted" result is displayed, so unformatted code is never
+    presented as if it had been formatted.
+    """
+    st.markdown("### Code Formatting with Black & isort")
+    st.markdown("""
+    Code formatters automatically reformat your code to follow a consistent style.
+    **Available Tools:**
+    - **Black**: The uncompromising Python code formatter
+    - **isort**: A utility to sort imports alphabetically and automatically separate them into sections
+    """)
+    # File upload for formatting
+    uploaded_file = st.file_uploader("Upload Python file for formatting", type=["py"])
+    formatter = st.radio("Select formatter", ["Black", "isort", "Both"])
+    # The button is only drawn once a file has been supplied (short-circuit).
+    if not (uploaded_file and st.button("Format Code")):
+        return
+    with st.spinner(f"Running {formatter}..."):
+        raw_bytes = uploaded_file.getvalue()
+        # Both tools rewrite a file in place, so the upload is spooled to a
+        # temp file that is removed in `finally`.
+        with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as tmp_file:
+            tmp_file.write(raw_bytes)
+            tmp_path = tmp_file.name
+        try:
+            # Decoded inside the try so a non-UTF-8 upload is reported as an
+            # error message rather than an unhandled exception.
+            original_code = raw_bytes.decode("utf-8")
+            # Black runs before isort; each tool edits tmp_path in place, so
+            # the second one automatically sees the first one's output.
+            tools = []
+            if formatter in ["Black", "Both"]:
+                tools.append("black")
+            if formatter in ["isort", "Both"]:
+                tools.append("isort")
+            for tool in tools:
+                result = subprocess.run(
+                    [tool, tmp_path],
+                    capture_output=True,
+                    text=True
+                )
+                if result.returncode != 0:
+                    st.error(f"{tool} could not format the file:")
+                    st.code(result.stderr or result.stdout, language="text")
+                    return
+            with open(tmp_path, "r", encoding="utf-8") as f:
+                formatted_code = f.read()
+            # Display results side by side
+            st.subheader("Formatting Results")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.markdown("#### Original Code")
+                st.code(original_code, language="python")
+            with col2:
+                st.markdown("#### Formatted Code")
+                st.code(formatted_code, language="python")
+        except Exception as e:
+            st.error(f"Error running {formatter}: {str(e)}")
+        finally:
+            # Clean up temporary file
+            os.unlink(tmp_path)
+def render_type_checking_tools():
+    """
+    Draw the type-checking panel: the user uploads a Python file and the
+    output of running mypy on it is shown.
+    """
+    st.markdown("### Type Checking with mypy")
+    st.markdown("""
+    Static type checking helps catch type errors before runtime.
+    **Available Tool:**
+    - **mypy**: Optional static typing for Python
+    """)
+    source_file = st.file_uploader("Upload Python file for type checking", type=["py"])
+    # The button is only drawn once a file has been supplied (short-circuit).
+    if not (source_file and st.button("Check Types")):
+        return
+    with st.spinner("Running mypy..."):
+        # mypy works on paths, so the upload is spooled to a temp file that
+        # outlives the `with` block and is removed in `finally`.
+        with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as handle:
+            handle.write(source_file.getvalue())
+            script_path = handle.name
+        try:
+            outcome = subprocess.run(
+                ["mypy", script_path],
+                capture_output=True,
+                text=True
+            )
+            st.subheader("Type Checking Results")
+            if outcome.returncode != 0:
+                st.error("Type issues found:")
+                st.code(outcome.stdout or outcome.stderr, language="text")
+            else:
+                st.success("No type issues found!")
+        except Exception as exc:
+            st.error(f"Error running mypy: {str(exc)}")
+        finally:
+            os.unlink(script_path)
+def render_testing_tools():
+    """
+    Draw the testing panel: the user uploads a test file (and optionally the
+    module under test), pytest runs on it in a throwaway directory, and the
+    output is shown.
+
+    NOTE(security): this executes uploaded Python code with the privileges of
+    the app process. Only expose it to trusted users or inside a sandbox.
+    """
+    st.markdown("### Testing with pytest")
+    st.markdown("""
+    Testing frameworks help ensure your code works as expected.
+    **Available Tool:**
+    - **pytest**: Simple and powerful testing framework
+    """)
+    # Test file upload
+    test_file = st.file_uploader("Upload test file", type=["py"])
+    # Code file upload (optional)
+    code_file = st.file_uploader("Upload code file to test (optional)", type=["py"])
+    # The button is only drawn once a test file has been supplied (short-circuit).
+    if not (test_file and st.button("Run Tests")):
+        return
+    with st.spinner("Running tests..."):
+        # Create temporary directory for test files
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Upload names come from the client; keep only the final path
+            # component so a crafted name cannot write outside tmp_dir.
+            test_path = os.path.join(tmp_dir, "test_" + os.path.basename(test_file.name))
+            with open(test_path, "wb") as f:
+                f.write(test_file.getvalue())
+            # The optional module is saved next to the test under its own name.
+            if code_file:
+                code_path = os.path.join(tmp_dir, os.path.basename(code_file.name))
+                with open(code_path, "wb") as f:
+                    f.write(code_file.getvalue())
+            try:
+                # Run pytest
+                result = subprocess.run(
+                    ["pytest", "-v", test_path],
+                    capture_output=True,
+                    text=True
+                )
+                # Display results
+                st.subheader("Test Results")
+                st.code(result.stdout, language="text")
+                # Show stderr too when pytest wrote anything there, so that
+                # output is not silently lost.
+                if result.stderr:
+                    st.code(result.stderr, language="text")
+                if result.returncode == 0:
+                    st.success("All tests passed!")
+                else:
+                    st.error("Some tests failed.")
+            except Exception as e:
+                st.error(f"Error running tests: {str(e)}")
+def create_pylintrc():
+    """
+    Return the text of a sample pylintrc file.
+
+    The sample has MASTER, MESSAGES CONTROL, FORMAT and DESIGN sections and
+    ends with a newline.
+    """
+    # One entry per line of the generated file.
+    rc_lines = [
+        "[MASTER]",
+        "# Python version",
+        "py-version = 3.8",
+        "# Parallel processing",
+        "jobs = 1",
+        "[MESSAGES CONTROL]",
+        "# Disable specific messages",
+        "disable=",
+        "    C0111, # missing-docstring",
+        "    C0103, # invalid-name",
+        "    R0903, # too-few-public-methods",
+        "    R0913, # too-many-arguments",
+        "    W0511, # fixme",
+        "[FORMAT]",
+        "# Maximum line length",
+        "max-line-length = 100",
+        "# Expected indentation",
+        "indent-string = '    '",
+        "[DESIGN]",
+        "# Maximum number of locals for function / method body",
+        "max-locals = 15",
+        "# Maximum number of arguments for function / method",
+        "max-args = 5",
+        "# Maximum number of attributes for a class",
+        "max-attributes = 7",
+    ]
+    return "\n".join(rc_lines) + "\n"
+def create_flake8_config():
+    """
+    Create a sample flake8 configuration file.
+    Each ignored code is explained by a comment on its own line. Comments
+    placed after a code on the same line become part of the option value,
+    which flake8 rejects as an invalid error code.
+    Returns:
+        str: INI-style text containing a single [flake8] section.
+    """
+    flake8_config = """[flake8]
+max-line-length = 100
+exclude = .git,__pycache__,build,dist
+ignore =
+    # whitespace before ':'
+    E203,
+    # line too long
+    E501,
+    # line break before binary operator
+    W503
+"""
+    return flake8_config
+def create_mypy_config():
+    """
+    Create a sample mypy configuration file.
+    Per-module overrides use the ``[mypy-<module pattern>]`` section form,
+    which is the form mypy matches against imported module names.
+    Returns:
+        str: INI-style text with the global [mypy] section followed by
+        per-module sections for numpy and pandas.
+    """
+    mypy_config = """[mypy]
+python_version = 3.8
+warn_return_any = True
+warn_unused_configs = True
+disallow_untyped_defs = False
+disallow_incomplete_defs = False
+[mypy-numpy.*]
+follow_imports = skip
+[mypy-pandas.*]
+follow_imports = skip
+"""
+    return mypy_config
+def create_pytest_config():
+    """
+    Build the text of a sample pytest configuration file.
+    Returns:
+        str: INI-style text containing a single [pytest] section with test
+        discovery settings and two custom markers, terminated by a newline.
+    """
+    # One entry per line of the generated file, in output order.
+    config_lines = [
+        "[pytest]",
+        "testpaths = tests",
+        "python_files = test_*.py",
+        "python_functions = test_*",
+        "markers =",
+        "    slow: marks tests as slow (deselect with '-m \"not slow\"')",
+        "    integration: marks tests as integration tests",
+    ]
+    return "\n".join(config_lines) + "\n"

components/dataset_preview.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import streamlit as st
+import pandas as pd
+import json
+def render_dataset_preview(dataset, dataset_type):
+    """
+    Renders a preview of the dataset with pagination options.
+    Shows headline metrics, a head/tail/sample table, a schema expander
+    (column types and missing values) and the first five records as text.
+    Args:
+        dataset: The dataset to preview (pandas DataFrame), or None
+        dataset_type: The type of dataset (csv, json, etc.)
+    """
+    import html  # local import: only needed here to escape the dataset name
+    if dataset is None:
+        st.warning("No dataset to preview.")
+        return
+    # The name is interpolated into raw HTML (unsafe_allow_html=True), so it
+    # is escaped first to stop markup in the name from being rendered.
+    safe_name = html.escape(str(st.session_state.dataset_name))
+    st.markdown(f"<h3>Dataset Preview: {safe_name}</h3>", unsafe_allow_html=True)
+    # Show basic info
+    row_count, column_count = dataset.shape[0], dataset.shape[1]
+    rows_box, cols_box, type_box = st.columns(3)
+    with rows_box:
+        st.metric("Rows", f"{row_count:,}")
+    with cols_box:
+        st.metric("Columns", f"{column_count:,}")
+    with type_box:
+        st.metric("Type", dataset_type.upper())
+    # Preview options
+    size_box, mode_box = st.columns([1, 3])
+    with size_box:
+        num_rows = st.number_input("Rows to display", min_value=5, max_value=100, value=10, step=5)
+    with mode_box:
+        preview_mode = st.radio("Preview mode", ["Head", "Tail", "Sample"], horizontal=True)
+    # Display dataset preview
+    st.markdown("<div class='dataset-preview'>", unsafe_allow_html=True)
+    if preview_mode == "Head":
+        preview = dataset.head(num_rows)
+    elif preview_mode == "Tail":
+        preview = dataset.tail(num_rows)
+    else:  # Sample: never request more rows than the dataset holds
+        preview = dataset.sample(min(num_rows, len(dataset)))
+    st.dataframe(preview, use_container_width=True)
+    st.markdown("</div>", unsafe_allow_html=True)
+    # Show dataset schema
+    with st.expander("Dataset Schema"):
+        types_box, missing_box = st.columns(2)
+        with types_box:
+            st.markdown("**Column Types**")
+            type_df = pd.DataFrame({
+                'Column': dataset.dtypes.index,
+                'Type': dataset.dtypes.values.astype(str)
+            })
+            st.dataframe(type_df, use_container_width=True)
+        with missing_box:
+            st.markdown("**Missing Values**")
+            missing_counts = dataset.isna().sum().values
+            # An empty dataset would otherwise divide 0 by 0 and show NaN
+            # percentages; report 0% for every column instead.
+            if len(dataset) > 0:
+                missing_pct = missing_counts / len(dataset) * 100
+            else:
+                missing_pct = missing_counts * 0.0
+            missing_df = pd.DataFrame({
+                'Column': dataset.columns,
+                'Missing': missing_counts,
+                'Percentage': missing_pct
+            })
+            st.dataframe(missing_df.style.format({
+                'Percentage': '{:.2f}%'
+            }), use_container_width=True)
+    # Raw data
+    with st.expander("Raw Data (First 5 records)"):
+        first_records = dataset.head(5)
+        if dataset_type == 'csv':
+            st.code(first_records.to_csv(index=False), language="text")
+        else:  # json or jsonl
+            st.code(first_records.to_json(orient='records', indent=2), language="json")

components/dataset_statistics.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+def render_dataset_statistics(dataset, dataset_type):
+    """
+    Renders statistical analysis of the dataset.
+    Three tabs are drawn: summary statistics (plus value counts for
+    non-numeric columns), the distribution of one numeric column, and a
+    correlation analysis of the numeric columns.
+    Args:
+        dataset: The dataset to analyze (pandas DataFrame), or None
+        dataset_type: The type of dataset (csv, json, etc.); not used here
+    """
+    if dataset is None:
+        st.warning("No dataset to analyze.")
+        return
+    st.markdown("<h3>Dataset Statistics</h3>", unsafe_allow_html=True)
+    # Column groups are shared by all three tabs, so compute them once.
+    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = dataset.select_dtypes(exclude=[np.number]).columns.tolist()
+    # Tabs for different kinds of statistics
+    tab1, tab2, tab3 = st.tabs(["Summary Statistics", "Distribution Analysis", "Correlation Analysis"])
+    with tab1:
+        st.markdown("### Summary Statistics")
+        if numeric_cols:
+            st.dataframe(dataset[numeric_cols].describe().T.style.highlight_max(axis=1, color='#FFD21E'), use_container_width=True)
+        else:
+            st.warning("No numeric columns found in the dataset.")
+        # Value counts do not depend on numeric columns, so they are shown
+        # even for datasets that are entirely categorical.
+        if categorical_cols:
+            st.markdown("### Category Value Counts")
+            selected_cat_col = st.selectbox("Select categorical column", categorical_cols)
+            value_counts = dataset[selected_cat_col].value_counts().head(10)
+            fig = px.bar(
+                x=value_counts.index,
+                y=value_counts.values,
+                title=f"Top 10 values in {selected_cat_col}",
+                labels={"x": selected_cat_col, "y": "Count"},
+                color_discrete_sequence=["#2563EB"]
+            )
+            st.plotly_chart(fig, use_container_width=True)
+    with tab2:
+        st.markdown("### Distribution Analysis")
+        if numeric_cols:
+            selected_num_col = st.selectbox("Select numeric column", numeric_cols)
+            fig = px.histogram(
+                dataset,
+                x=selected_num_col,
+                title=f"Distribution of {selected_num_col}",
+                marginal="box",
+                color_discrete_sequence=["#FFD21E"],
+                template="simple_white"
+            )
+            st.plotly_chart(fig, use_container_width=True)
+            # Basic distribution stats
+            selected_series = dataset[selected_num_col]
+            stat_boxes = st.columns(4)
+            stat_values = (
+                ("Mean", selected_series.mean()),
+                ("Median", selected_series.median()),
+                ("Min", selected_series.min()),
+                ("Max", selected_series.max()),
+            )
+            for box, (label, value) in zip(stat_boxes, stat_values):
+                with box:
+                    st.metric(label, f"{value:.2f}")
+        else:
+            st.warning("No numeric columns found in the dataset.")
+    with tab3:
+        st.markdown("### Correlation Analysis")
+        if len(numeric_cols) > 1:
+            corr_matrix = dataset[numeric_cols].corr()
+            fig = px.imshow(
+                corr_matrix,
+                color_continuous_scale=["#84919A", "#FFFFFF", "#FFD21E"],
+                title="Correlation Matrix",
+                template="simple_white"
+            )
+            st.plotly_chart(fig, use_container_width=True)
+            st.markdown("### Top Correlated Features")
+            # Flatten the upper triangle into (feature, feature, value) rows.
+            # NaN correlations (e.g. constant columns) are skipped because
+            # they cannot be ranked.
+            names = list(corr_matrix.columns)
+            corr_pairs = [
+                (names[i], names[j], corr_matrix.iloc[i, j])
+                for i in range(len(names))
+                for j in range(i + 1, len(names))
+                if pd.notna(corr_matrix.iloc[i, j])
+            ]
+            # Strongest relationship first, regardless of sign
+            corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
+            if corr_pairs:
+                top_pairs = pd.DataFrame(corr_pairs[:10], columns=["Feature 1", "Feature 2", "Correlation"])
+                st.dataframe(
+                    top_pairs.style.format({
+                        "Correlation": "{:.4f}"
+                    }).background_gradient(subset=["Correlation"], cmap="coolwarm"),
+                    use_container_width=True
+                )
+                # Scatter plot for the top correlated pair
+                x_name, y_name, top_corr = corr_pairs[0]
+                fig = px.scatter(
+                    dataset,
+                    x=x_name,
+                    y=y_name,
+                    title=f"Scatter plot: {x_name} vs {y_name} (Corr: {top_corr:.4f})",
+                    color_discrete_sequence=["#2563EB"],
+                    template="simple_white"
+                )
+                # Overlay a least-squares line through the rows where both
+                # values are present; skip it if the fit cannot be computed.
+                pair_data = dataset[[x_name, y_name]].dropna()
+                try:
+                    x_values = pair_data[x_name].astype(float)
+                    y_values = pair_data[y_name].astype(float)
+                    slope, intercept = np.polyfit(x_values, y_values, 1)
+                    line_x = np.array([x_values.min(), x_values.max()])
+                except (ValueError, TypeError, np.linalg.LinAlgError):
+                    line_x = None
+                if line_x is not None:
+                    fig.add_traces(
+                        go.Scatter(
+                            x=line_x,
+                            y=slope * line_x + intercept,
+                            mode='lines',
+                            line=dict(color="#FFD21E", width=3),
+                            name='Best Fit'
+                        )
+                    )
+                st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.warning("Need at least two numeric columns for correlation analysis.")

components/dataset_uploader.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import streamlit as st
+import pandas as pd
+import json
+import io
+from utils.dataset_utils import get_dataset_info, detect_dataset_format
+def _read_json_upload(uploaded_file):
+    """
+    Parse an uploaded JSON file into a DataFrame.
+    Tries, in order: pandas' regular JSON reader, pandas' JSON Lines reader,
+    then a manual conversion of the decoded JSON document.
+    Args:
+        uploaded_file: File-like upload supporting seek() and getvalue()
+    Returns:
+        tuple: (DataFrame, dataset_type) where dataset_type is "json" or "jsonl"
+    Raises:
+        ValueError: If the content is not valid JSON or is neither a JSON
+            array nor a JSON object.
+    """
+    for reader_options, detected_type in (({}, "json"), ({"lines": True}, "jsonl")):
+        # A failed read leaves the stream at its end; rewind so every attempt
+        # sees the whole file.
+        uploaded_file.seek(0)
+        try:
+            return pd.read_json(uploaded_file, **reader_options), detected_type
+        except Exception:
+            # Best-effort format detection: fall through to the next strategy.
+            continue
+    content = json.loads(uploaded_file.getvalue().decode("utf-8"))
+    if isinstance(content, list):
+        return pd.DataFrame(content), "json"
+    if isinstance(content, dict):
+        # Nested structure: use the first list-valued field as the records.
+        for value in content.values():
+            if isinstance(value, list):
+                return pd.DataFrame(value), "json"
+        # Flat dict or dict of dicts becomes a single-row frame.
+        return pd.DataFrame([content]), "json"
+    raise ValueError("Unsupported JSON structure: expected an array or an object")
+def _store_dataset(df, name, dataset_type):
+    """Save the loaded dataset and its derived info into the Streamlit session state."""
+    st.session_state.dataset = df
+    st.session_state.dataset_name = name
+    st.session_state.dataset_type = dataset_type
+    st.session_state.dataset_info = get_dataset_info(df)
+def render_dataset_uploader():
+    """
+    Renders the dataset upload component that supports CSV and JSON formats.
+    An uploaded file takes precedence over the sample-dataset selector. On
+    success the dataset, its name, its type and its info are stored in
+    st.session_state; on failure an error message is shown instead.
+    """
+    st.markdown("""
+    <div class="upload-container">
+        <p>Upload your dataset in CSV or JSON format</p>
+    </div>
+    """, unsafe_allow_html=True)
+    # File uploader
+    uploaded_file = st.file_uploader(
+        "Choose a file",
+        type=["csv", "json"],
+        help="Upload a CSV or JSON file containing your dataset"
+    )
+    # Sample dataset option
+    st.markdown("Or use a sample dataset:")
+    sample_dataset = st.selectbox(
+        "Select a sample dataset",
+        ["None", "Iris Dataset", "Titanic Dataset", "Boston Housing Dataset"]
+    )
+    # Process uploaded file
+    if uploaded_file is not None:
+        try:
+            file_extension = uploaded_file.name.split(".")[-1].lower()
+            if file_extension == "csv":
+                df = pd.read_csv(uploaded_file)
+                dataset_type = "csv"
+            elif file_extension == "json":
+                df, dataset_type = _read_json_upload(uploaded_file)
+            else:
+                st.error(f"Unsupported file format: {file_extension}")
+                return
+            _store_dataset(df, uploaded_file.name, dataset_type)
+        except Exception as e:
+            st.error(f"Error loading dataset: {str(e)}")
+    # Process sample dataset
+    elif sample_dataset != "None":
+        try:
+            if sample_dataset == "Iris Dataset":
+                from sklearn.datasets import load_iris
+                iris = load_iris()
+                df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+                df['target'] = iris.target
+                dataset_type = "csv"
+            elif sample_dataset == "Titanic Dataset":
+                # Downloaded at request time from a public GitHub mirror
+                url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
+                df = pd.read_csv(url)
+                dataset_type = "csv"
+            elif sample_dataset == "Boston Housing Dataset":
+                # NOTE(review): the option is labelled "Boston" but the data
+                # loaded is scikit-learn's California housing set; confirm
+                # whether the label should be renamed.
+                from sklearn.datasets import fetch_california_housing
+                housing = fetch_california_housing()
+                df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
+                df['target'] = housing.target
+                dataset_type = "csv"
+            _store_dataset(df, sample_dataset, dataset_type)
+        except Exception as e:
+            st.error(f"Error loading sample dataset: {str(e)}")

components/dataset_validation.py ADDED Viewed

	@@ -0,0 +1,181 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import json
+from utils.dataset_utils import check_column_completeness, detect_outliers
+def _detect_type_issues(dataset):
+    """
+    Find text columns whose values look numeric or date-like.
+    For each object-dtype column, up to 100 non-null values are sampled; a
+    suggestion is recorded when more than 80% of the sample converts. The
+    numeric check runs first and, when it matches, the datetime check is
+    skipped for that column.
+    Args:
+        dataset: pandas DataFrame to inspect
+    Returns:
+        list[dict]: One entry per flagged column with the keys 'Column',
+        'Current Type', 'Suggested Type' and 'Issue'.
+    """
+    checks = (
+        ("numeric", pd.to_numeric, "Column contains mostly numeric values but is stored as text"),
+        ("datetime", pd.to_datetime, "Column contains mostly dates but is stored as text"),
+    )
+    issues = []
+    for col in dataset.columns:
+        if dataset[col].dtype != 'object':
+            continue
+        sample = dataset[col].dropna().head(100)
+        if len(sample) == 0:
+            continue
+        for suggested_type, converter, message in checks:
+            try:
+                convertible = converter(sample, errors='coerce').notna().sum()
+            except Exception:
+                # The check is a heuristic: a column that cannot be probed
+                # simply gets no suggestion from this converter.
+                continue
+            if convertible / len(sample) > 0.8:  # More than 80% can be converted
+                issues.append({
+                    'Column': col,
+                    'Current Type': 'object',
+                    'Suggested Type': suggested_type,
+                    'Issue': message
+                })
+                break
+    return issues
+def render_dataset_validation(dataset, dataset_type):
+    """
+    Renders validation checks for the dataset.
+    Displays headline quality metrics, then two tabs: data quality issues
+    (missing values, duplicates, suspicious column types, completeness) and
+    outlier detection for a selected numeric column.
+    Args:
+        dataset: The dataset to validate (pandas DataFrame), or None
+        dataset_type: The type of dataset (csv, json, etc.); not used here
+    """
+    if dataset is None:
+        st.warning("No dataset to validate.")
+        return
+    st.markdown("<h3>Dataset Validation</h3>", unsafe_allow_html=True)
+    # Data quality metrics
+    col1, col2, col3, col4 = st.columns(4)
+    row_count = dataset.shape[0]
+    total_cells = row_count * dataset.shape[1]
+    missing_cells = dataset.isna().sum().sum()
+    missing_percentage = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
+    duplicate_rows = dataset.duplicated().sum()
+    duplicate_percentage = (duplicate_rows / row_count) * 100 if row_count > 0 else 0
+    with col1:
+        st.metric("Completeness", f"{100 - missing_percentage:.2f}%")
+    with col2:
+        st.metric("Missing Values", f"{missing_cells:,} ({missing_percentage:.2f}%)")
+    with col3:
+        st.metric("Duplicate Rows", f"{duplicate_rows:,} ({duplicate_percentage:.2f}%)")
+    with col4:
+        # Quality score is a simple metric between 0-100 based on completeness and duplicates
+        quality_score = 100 - (missing_percentage + duplicate_percentage)
+        quality_score = max(0, min(100, quality_score))  # Clamp between 0 and 100
+        st.metric("Quality Score", f"{quality_score:.2f}/100")
+    # Tabs for different validation aspects
+    tab1, tab2 = st.tabs(["Data Quality Issues", "Anomaly Detection"])
+    with tab1:
+        st.markdown("### Data Quality Issues")
+        # Only columns that actually have missing values are listed
+        missing_by_col = dataset.isna().sum()
+        missing_by_col = missing_by_col[missing_by_col > 0]
+        if not missing_by_col.empty:
+            st.markdown("#### Missing Values by Column")
+            missing_df = pd.DataFrame({
+                'Column': missing_by_col.index,
+                'Missing Count': missing_by_col.values,
+                'Percentage': (missing_by_col.values / row_count * 100).round(2)
+            })
+            missing_df['Status'] = missing_df['Percentage'].apply(
+                lambda x: "🟢 Good" if x < 5 else ("🟠 Warning" if x < 20 else "🔴 Critical")
+            )
+            st.dataframe(
+                missing_df.style.format({
+                    'Percentage': '{:.2f}%'
+                }).background_gradient(subset=['Percentage'], cmap='Reds'),
+                use_container_width=True
+            )
+        else:
+            st.success("No missing values found in the dataset!")
+        # Check for duplicate rows
+        if duplicate_rows > 0:
+            st.markdown("#### Duplicate Rows")
+            st.warning(f"Found {duplicate_rows} duplicate rows ({duplicate_percentage:.2f}% of the dataset)")
+            # Option to show duplicates
+            if st.checkbox("Show duplicates"):
+                st.dataframe(dataset[dataset.duplicated(keep='first')], use_container_width=True)
+        else:
+            st.success("No duplicate rows found in the dataset!")
+        # Check column data types
+        st.markdown("#### Column Data Types")
+        type_issues = _detect_type_issues(dataset)
+        if type_issues:
+            st.dataframe(pd.DataFrame(type_issues), use_container_width=True)
+        else:
+            st.success("No data type issues detected!")
+        # Check for column completeness
+        st.markdown("#### Column Completeness Check")
+        completeness_results = check_column_completeness(dataset)
+        if completeness_results:
+            st.dataframe(pd.DataFrame(completeness_results), use_container_width=True)
+        else:
+            st.success("All columns have good completeness!")
+    with tab2:
+        st.markdown("### Anomaly Detection")
+        # Detect outliers in numeric columns
+        numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
+        if numeric_cols:
+            selected_num_col = st.selectbox("Select column to check for outliers", numeric_cols)
+            # NOTE(review): `outliers` is used below as a collection of index
+            # labels (index.isin / .loc); confirm against detect_outliers.
+            outliers, lower_bound, upper_bound = detect_outliers(dataset[selected_num_col])
+            # A dataset can have numeric columns but no rows; avoid dividing by zero.
+            outlier_percentage = (len(outliers) / row_count) * 100 if row_count > 0 else 0
+            st.markdown(f"#### Outliers in column: {selected_num_col}")
+            st.metric("Outliers Detected", f"{len(outliers)} ({outlier_percentage:.2f}%)")
+            st.markdown(f"""
+            **Bounds for outlier detection:**
+            - Lower bound: {lower_bound:.4f}
+            - Upper bound: {upper_bound:.4f}
+            """)
+            if len(outliers) > 0:
+                # Plot with outliers highlighted
+                import plotly.express as px
+                # Work on a copy so the helper column never leaks into the dataset
+                temp_df = dataset.copy()
+                temp_df['is_outlier'] = temp_df.index.isin(outliers)
+                fig = px.box(
+                    temp_df,
+                    y=selected_num_col,
+                    color='is_outlier',
+                    color_discrete_map={True: "#FF5757", False: "#2563EB"},
+                    title=f"Outliers in {selected_num_col}",
+                    labels={"is_outlier": "Is Outlier"}
+                )
+                st.plotly_chart(fig, use_container_width=True)
+                # Option to show outliers in table
+                if st.checkbox("Show outlier data"):
+                    st.dataframe(dataset.loc[outliers], use_container_width=True)
+            else:
+                st.success(f"No outliers detected in {selected_num_col}!")
+        else:
+            st.warning("No numeric columns found for outlier detection.")

components/dataset_version_control.py ADDED Viewed

	@@ -0,0 +1,276 @@

+"""
+Dataset version control UI component for the ML Dataset & Code Generation Manager.
+Provides UI for viewing, comparing, and restoring dataset versions.
+"""
+import streamlit as st
+import pandas as pd
+import numpy as np
+import datetime
+import hashlib
+import plotly.express as px
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any
+from database import version_control
+def _rerun_app():
+    """Restart the Streamlit script, using st.rerun when available and st.experimental_rerun otherwise."""
+    rerun = getattr(st, "rerun", None)
+    if rerun is None:
+        # Older Streamlit releases only ship the experimental name.
+        rerun = st.experimental_rerun
+    rerun()
+def render_version_control_ui(dataset_id: int, df: Optional[pd.DataFrame] = None):
+    """
+    Render the version control UI for a dataset
+    Lists the stored versions, lets the user inspect, view, restore and
+    compare them, and keeps the viewed data and the last comparison in
+    st.session_state so they survive reruns until cleared.
+    Args:
+        dataset_id: ID of the dataset
+        df: Current DataFrame of the dataset (optional). Needed to create the
+            initial version; restoring is also refused when it is None.
+    """
+    timestamp_format = "%Y-%m-%d %H:%M:%S"
+    st.header("Dataset Version Control")
+    # Get all versions of the dataset
+    versions = version_control.get_versions(dataset_id)
+    if not versions:
+        st.info("No versions found for this dataset. Save changes to create the first version.")
+        if df is not None and st.button("Create Initial Version"):
+            version = version_control.create_version(
+                dataset_id=dataset_id,
+                df=df,
+                description="Initial version"
+            )
+            st.success(f"Created initial version: {version.version_id}")
+            _rerun_app()
+        return
+    # Display version history
+    st.subheader("Version History")
+    history_df = pd.DataFrame([
+        {
+            "Version ID": v.version_id,
+            "Date": v.timestamp.strftime(timestamp_format),
+            "Rows": v.metadata.get("rows", "N/A"),
+            "Columns": v.metadata.get("columns", "N/A"),
+            "Description": v.description
+        }
+        for v in versions
+    ])
+    st.dataframe(history_df, use_container_width=True)
+    # Version actions section
+    st.subheader("Version Actions")
+    version_ids = [v.version_id for v in versions]
+    # Label lookup for the selector; the first occurrence of an ID wins.
+    timestamp_by_id = {}
+    for v in versions:
+        timestamp_by_id.setdefault(v.version_id, v.timestamp.strftime(timestamp_format))
+    col1, col2 = st.columns(2)
+    with col1:
+        selected_version = st.selectbox(
+            "Select Version",
+            options=version_ids,
+            format_func=lambda x: f"{x} - {timestamp_by_id.get(x, '')}"
+        )
+        # Get selected version object
+        selected_v = next((v for v in versions if v.version_id == selected_version), None)
+        if selected_v:
+            st.write(f"**Description:** {selected_v.description}")
+            st.write(f"**Created:** {selected_v.timestamp.strftime(timestamp_format)}")
+            # Display metadata
+            if selected_v.metadata:
+                with st.expander("Version Metadata"):
+                    for key, value in selected_v.metadata.items():
+                        if key != "column_names":  # Show column names separately
+                            st.write(f"**{key}:** {value}")
+                    if "column_names" in selected_v.metadata:
+                        st.write("**Columns:**")
+                        st.write(", ".join(selected_v.metadata["column_names"]))
+    with col2:
+        st.write("**Actions:**")
+        if selected_v:
+            # Load selected version
+            if st.button("View Version Data"):
+                st.session_state["viewing_version_df"] = version_control.load_version_data(selected_v)
+                st.session_state["viewing_version_id"] = selected_v.version_id
+            # Restore version
+            if st.button("Restore This Version"):
+                if df is not None:
+                    description = st.session_state.get("restore_description", f"Restored from {selected_v.version_id}")
+                    new_version = version_control.restore_version(
+                        dataset_id=dataset_id,
+                        version_id=selected_v.version_id,
+                        description=description
+                    )
+                    st.success(f"Restored version {selected_v.version_id} as new version {new_version.version_id}")
+                    _rerun_app()
+                else:
+                    st.error("Cannot restore version: No dataset provided")
+        # Compare versions
+        if len(versions) > 1:
+            st.write("**Compare Versions:**")
+            compare_v1 = st.selectbox("Version 1", options=version_ids, key="compare_v1")
+            compare_v2 = st.selectbox("Version 2", options=version_ids, key="compare_v2")
+            if st.button("Compare Versions"):
+                if compare_v1 != compare_v2:
+                    st.session_state["version_comparison"] = version_control.compare_versions(
+                        dataset_id=dataset_id,
+                        version_id1=compare_v1,
+                        version_id2=compare_v2
+                    )
+                else:
+                    st.warning("Please select different versions to compare")
+    # Show version data if requested
+    if "viewing_version_df" in st.session_state:
+        st.subheader(f"Data for Version: {st.session_state['viewing_version_id']}")
+        st.dataframe(st.session_state["viewing_version_df"], use_container_width=True)
+        if st.button("Clear Version View"):
+            del st.session_state["viewing_version_df"]
+            del st.session_state["viewing_version_id"]
+            _rerun_app()
+    # Show version comparison if requested
+    if "version_comparison" in st.session_state:
+        comparison = st.session_state["version_comparison"]
+        st.subheader("Version Comparison")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.write(f"**Version 1:** {comparison['version1']}")
+            st.write(f"**Date:** {comparison['version1_timestamp'].strftime(timestamp_format)}")
+        with col2:
+            st.write(f"**Version 2:** {comparison['version2']}")
+            st.write(f"**Date:** {comparison['version2_timestamp'].strftime(timestamp_format)}")
+        # Show the signed difference once, in the same form the save-version UI uses.
+        rows_diff = comparison['rows_diff']
+        st.write(f"**Rows Changed:** {'+' if rows_diff > 0 else ''}{rows_diff}")
+        if comparison["columns_added"]:
+            st.write("**Columns Added:**")
+            for col in comparison["columns_added"]:
+                st.write(f"- {col}")
+        if comparison["columns_removed"]:
+            st.write("**Columns Removed:**")
+            for col in comparison["columns_removed"]:
+                st.write(f"- {col}")
+        if comparison["columns_diff"]:
+            st.write("**Columns Changed:**")
+            for col, diff in comparison["columns_diff"].items():
+                # A type change takes precedence over a plain value change.
+                if diff.get("type_changed", False):
+                    st.write(f"- {col}: Type changed from {diff['type1']} to {diff['type2']}")
+                elif diff.get("values_changed", False):
+                    st.write(f"- {col}: Values changed")
+        if st.button("Clear Comparison"):
+            del st.session_state["version_comparison"]
+            _rerun_app()
+def render_save_version_ui(dataset_id: int, df: pd.DataFrame):
+    """
+    Render UI for saving a new version of a dataset
+    When a previous version exists, a summary of row/column count changes is
+    shown, plus a notice when the content hash matches the stored one.
+    Args:
+        dataset_id: ID of the dataset
+        df: DataFrame to save
+    Returns:
+        The object returned by version_control.create_version when the
+        "Save Version" button was clicked in this run, otherwise None.
+    """
+    st.subheader("Save Current Version")
+    # Get latest version if any
+    latest_version = version_control.get_latest_version(dataset_id)
+    # Calculate changes if a previous version exists
+    if latest_version:
+        try:
+            prev_df = version_control.load_version_data(latest_version)
+            rows_diff = len(df) - len(prev_df)
+            cols_diff = len(df.columns) - len(prev_df.columns)
+            st.write("Changes from last version:")
+            st.write(f"- Rows: {'+' if rows_diff > 0 else ''}{rows_diff}")
+            st.write(f"- Columns: {'+' if cols_diff > 0 else ''}{cols_diff}")
+            # Check content hash
+            # NOTE(review): assumes version_control stores "content_hash" as
+            # the first 8 hex digits of the MD5 of df.to_json() - confirm.
+            current_hash = hashlib.md5(df.to_json().encode()).hexdigest()[:8]
+            if current_hash == latest_version.metadata.get("content_hash"):
+                st.info("No changes detected in the data content since the last version.")
+        except Exception:
+            # The comparison is informational only, so a failure must not
+            # block saving. Exception (not a bare except) keeps
+            # KeyboardInterrupt/SystemExit propagating.
+            st.warning("Could not compare with previous version.")
+    # Input for version description
+    description = st.text_area("Version Description", placeholder="Describe the changes in this version", key="version_description")
+    # Save button
+    if st.button("Save Version"):
+        version = version_control.create_version(
+            dataset_id=dataset_id,
+            df=df,
+            description=description
+        )
+        st.success(f"Created new version: {version.version_id}")
+        return version
+    return None
+def render_version_visualization(dataset_id: int):
+    """
+    Plot how a dataset's row and column counts evolved across its versions.
+    Shows an informational message and returns early when the dataset has
+    no stored versions.
+    Args:
+        dataset_id: ID of the dataset
+    """
+    versions = version_control.get_versions(dataset_id)
+    if not versions:
+        st.info("No versions available to visualize.")
+        return
+    st.subheader("Version Metrics Visualization")
+    # One record per version; the full ID and description feed the hover tooltip.
+    viz_df = pd.DataFrame([
+        {
+            "Version": entry.version_id[:8] + "...",  # Truncated ID for display
+            "Date": entry.timestamp,
+            "Rows": entry.metadata.get("rows", 0),
+            "Columns": entry.metadata.get("columns", 0),
+            "Full Version ID": entry.version_id,
+            "Description": entry.description
+        }
+        for entry in versions
+    ])
+    # (metric column, chart title), drawn in this order
+    charts = (
+        ("Rows", "Dataset Size (Rows) Across Versions"),
+        ("Columns", "Dataset Structure (Columns) Across Versions"),
+    )
+    for metric, chart_title in charts:
+        figure = px.line(
+            viz_df,
+            x="Date",
+            y=metric,
+            title=chart_title,
+            markers=True,
+            hover_data=["Full Version ID", "Description"]
+        )
+        st.plotly_chart(figure, use_container_width=True)

components/dataset_visualization.py ADDED Viewed

	@@ -0,0 +1,502 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+def render_dataset_visualization(dataset, dataset_type):
+    """
+    Renders visualizations for the dataset.
+    Args:
+        dataset: The dataset to visualize (pandas DataFrame)
+        dataset_type: The type of dataset (csv, json, etc.)
+    """
+    if dataset is None:
+        st.warning("No dataset to visualize.")
+        return
+    st.markdown("<h3>Dataset Visualization</h3>", unsafe_allow_html=True)
+    # Get column types
+    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = dataset.select_dtypes(include=['object', 'category']).columns.tolist()
+    date_cols = [col for col in dataset.columns if dataset[col].dtype == 'datetime64[ns]']
+    # Add visualization options based on column types
+    viz_type = st.selectbox(
+        "Select visualization type",
+        ["Distribution", "Correlation", "Categories", "Time Series", "Custom"],
+        help="Choose the type of visualization to create"
+    )
+    if viz_type == "Distribution":
+        if numeric_cols:
+            # Select columns for distribution visualization
+            selected_cols = st.multiselect(
+                "Select columns to visualize",
+                numeric_cols,
+                default=numeric_cols[:min(3, len(numeric_cols))]
+            )
+            if not selected_cols:
+                st.warning("Please select at least one column to visualize.")
+                return
+            # Distribution plots
+            if len(selected_cols) == 1:
+                # Single column histogram with density curve
+                col = selected_cols[0]
+                fig = px.histogram(
+                    dataset,
+                    x=col,
+                    histnorm='probability density',
+                    title=f"Distribution of {col}",
+                    color_discrete_sequence=["#FFD21E"],
+                    template="simple_white"
+                )
+                fig.add_traces(
+                    go.Scatter(
+                        x=dataset[col].sort_values(),
+                        y=dataset[col].sort_values().reset_index(drop=True).rolling(
+                            window=int(len(dataset[col])/10) if len(dataset[col]) > 10 else len(dataset[col]),
+                            min_periods=1,
+                            center=True
+                        ).mean(),
+                        mode='lines',
+                        line=dict(color="#2563EB", width=3),
+                        name='Smoothed'
+                    )
+                )
+                st.plotly_chart(fig, use_container_width=True)
+            else:
+                # Multiple histograms in a grid
+                num_cols = min(len(selected_cols), 2)
+                num_rows = (len(selected_cols) + num_cols - 1) // num_cols
+                fig = make_subplots(
+                    rows=num_rows,
+                    cols=num_cols,
+                    subplot_titles=[f"Distribution of {col}" for col in selected_cols]
+                )
+                for i, col in enumerate(selected_cols):
+                    row = i // num_cols + 1
+                    col_pos = i % num_cols + 1
+                    # Add histogram
+                    fig.add_trace(
+                        go.Histogram(
+                            x=dataset[col],
+                            name=col,
+                            marker_color="#FFD21E"
+                        ),
+                        row=row, col=col_pos
+                    )
+                fig.update_layout(
+                    title="Distribution of Selected Features",
+                    showlegend=False,
+                    template="simple_white",
+                    height=300 * num_rows
+                )
+                st.plotly_chart(fig, use_container_width=True)
+            # Show distribution statistics
+            st.markdown("### Distribution Statistics")
+            stats_df = dataset[selected_cols].describe().T
+            st.dataframe(stats_df, use_container_width=True)
+        else:
+            st.warning("No numeric columns found for distribution visualization.")
+    elif viz_type == "Correlation":
+        if len(numeric_cols) >= 2:
+            # Correlation matrix
+            st.markdown("### Correlation Matrix")
+            # Select columns for correlation
+            selected_cols = st.multiselect(
+                "Select columns for correlation analysis",
+                numeric_cols,
+                default=numeric_cols[:min(5, len(numeric_cols))]
+            )
+            if len(selected_cols) < 2:
+                st.warning("Please select at least two columns for correlation analysis.")
+                return
+            # Compute correlation
+            corr = dataset[selected_cols].corr()
+            # Heatmap
+            fig = px.imshow(
+                corr,
+                color_continuous_scale="RdBu_r",
+                title="Correlation Matrix",
+                template="simple_white",
+                text_auto=True
+            )
+            st.plotly_chart(fig, use_container_width=True)
+            # Scatter plot matrix for selected columns
+            if len(selected_cols) > 2 and len(selected_cols) <= 5:  # Limit to 5 columns for readability
+                st.markdown("### Scatter Plot Matrix")
+                fig = px.scatter_matrix(
+                    dataset,
+                    dimensions=selected_cols,
+                    color_discrete_sequence=["#2563EB"],
+                    title="Scatter Plot Matrix",
+                    template="simple_white"
+                )
+                fig.update_traces(diagonal_visible=False)
+                st.plotly_chart(fig, use_container_width=True)
+            # Correlation pairs as bar chart
+            st.markdown("### Top Correlation Pairs")
+            # Get correlation pairs
+            corr_pairs = []
+            for i in range(len(corr.columns)):
+                for j in range(i+1, len(corr.columns)):
+                    corr_pairs.append({
+                        'Feature 1': corr.columns[i],
+                        'Feature 2': corr.columns[j],
+                        'Correlation': corr.iloc[i, j]
+                    })
+            # Sort by absolute correlation
+            corr_pairs = sorted(corr_pairs, key=lambda x: abs(x['Correlation']), reverse=True)
+            # Create bar chart
+            if corr_pairs:
+                # Convert to DataFrame
+                corr_df = pd.DataFrame(corr_pairs)
+                pair_labels = [f"{row['Feature 1']} & {row['Feature 2']}" for _, row in corr_df.iterrows()]
+                # Bar chart
+                fig = px.bar(
+                    x=pair_labels,
+                    y=[abs(c) for c in corr_df['Correlation']],
+                    color=corr_df['Correlation'],
+                    color_continuous_scale="RdBu_r",
+                    labels={'x': 'Feature Pairs', 'y': 'Absolute Correlation'},
+                    title="Top Feature Correlations"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.warning("Need at least two numeric columns for correlation analysis.")
+    elif viz_type == "Categories":
+        if categorical_cols:
+            # Select categorical column
+            selected_cat = st.selectbox("Select categorical column", categorical_cols)
+            # Category counts
+            value_counts = dataset[selected_cat].value_counts()
+            # Limit to top N categories if there are too many
+            if len(value_counts) > 20:
+                st.info(f"Showing top 20 categories out of {len(value_counts)}")
+                value_counts = value_counts.head(20)
+            # Bar chart
+            fig = px.bar(
+                x=value_counts.index,
+                y=value_counts.values,
+                title=f"Category Counts for {selected_cat}",
+                labels={'x': selected_cat, 'y': 'Count'},
+                color_discrete_sequence=["#FFD21E"]
+            )
+            st.plotly_chart(fig, use_container_width=True)
+            # If there are numeric columns, show relationship with categorical
+            if numeric_cols:
+                st.markdown(f"### {selected_cat} vs Numeric Features")
+                selected_num = st.selectbox("Select numeric column", numeric_cols)
+                # Box plot
+                fig = px.box(
+                    dataset,
+                    x=selected_cat,
+                    y=selected_num,
+                    title=f"{selected_cat} vs {selected_num}",
+                    color_discrete_sequence=["#2563EB"],
+                    template="simple_white"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+                # Statistics by category
+                st.markdown(f"### Statistics of {selected_num} by {selected_cat}")
+                stats_by_cat = dataset.groupby(selected_cat)[selected_num].describe()
+                st.dataframe(stats_by_cat, use_container_width=True)
+        else:
+            st.warning("No categorical columns found for category visualization.")
+    elif viz_type == "Time Series":
+        # Check if there are potential date columns
+        potential_date_cols = date_cols.copy()
+        # Also check for object columns that might be dates
+        for col in categorical_cols:
+            # Sample the column to check if it contains date-like strings
+            sample = dataset[col].dropna().head(5).tolist()
+            if sample and all('/' in str(x) or '-' in str(x) for x in sample):
+                potential_date_cols.append(col)
+        if potential_date_cols:
+            date_col = st.selectbox("Select date column", potential_date_cols)
+            # Convert to datetime if it's not already
+            if dataset[date_col].dtype != 'datetime64[ns]':
+                try:
+                    temp_df = dataset.copy()
+                    temp_df[date_col] = pd.to_datetime(temp_df[date_col])
+                except:
+                    st.error(f"Could not convert {date_col} to datetime.")
+                    return
+            else:
+                temp_df = dataset.copy()
+            # Select numeric column for time series
+            if numeric_cols:
+                value_col = st.selectbox("Select value column", numeric_cols)
+                # Aggregate by time period
+                time_period = st.selectbox(
+                    "Aggregate by",
+                    ["Day", "Week", "Month", "Quarter", "Year"]
+                )
+                # Set up time grouping
+                if time_period == "Day":
+                    temp_df['period'] = temp_df[date_col].dt.date
+                elif time_period == "Week":
+                    temp_df['period'] = temp_df[date_col].dt.to_period('W').dt.start_time
+                elif time_period == "Month":
+                    temp_df['period'] = temp_df[date_col].dt.to_period('M').dt.start_time
+                elif time_period == "Quarter":
+                    temp_df['period'] = temp_df[date_col].dt.to_period('Q').dt.start_time
+                else:  # Year
+                    temp_df['period'] = temp_df[date_col].dt.year
+                # Aggregate data
+                agg_method = st.selectbox("Aggregation method", ["Mean", "Sum", "Min", "Max", "Count"])
+                agg_map = {
+                    "Mean": "mean",
+                    "Sum": "sum",
+                    "Min": "min",
+                    "Max": "max",
+                    "Count": "count"
+                }
+                time_series = temp_df.groupby('period')[value_col].agg(agg_map[agg_method]).reset_index()
+                # Line chart
+                fig = px.line(
+                    time_series,
+                    x='period',
+                    y=value_col,
+                    title=f"{agg_method} of {value_col} by {time_period}",
+                    markers=True,
+                    color_discrete_sequence=["#2563EB"],
+                    template="simple_white"
+                )
+                fig.update_layout(
+                    xaxis_title=time_period,
+                    yaxis_title=f"{agg_method} of {value_col}"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+                # Show trendline option
+                if st.checkbox("Show trendline"):
+                    fig = px.scatter(
+                        time_series,
+                        x='period',
+                        y=value_col,
+                        trendline="ols",
+                        title=f"{agg_method} of {value_col} by {time_period} with Trendline",
+                        color_discrete_sequence=["#2563EB"],
+                        template="simple_white"
+                    )
+                    fig.update_layout(
+                        xaxis_title=time_period,
+                        yaxis_title=f"{agg_method} of {value_col}"
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+                # Table view of time series data
+                st.dataframe(time_series, use_container_width=True)
+            else:
+                st.warning("No numeric columns found for time series values.")
+        else:
+            st.warning("No date columns found for time series visualization.")
+    elif viz_type == "Custom":
+        st.markdown("### Custom Visualization")
+        st.info("Create a custom plot by selecting axes and plot type")
+        # Select plot type
+        plot_type = st.selectbox(
+            "Select plot type",
+            ["Scatter", "Line", "Bar", "Box", "Violin", "Histogram", "Pie", "3D Scatter"]
+        )
+        # Depending on the plot type, get required axes
+        if plot_type in ["Scatter", "Line", "Bar", "3D Scatter"]:
+            # For scatter/line/bar, we need x and y
+            x_col = st.selectbox("X-axis", dataset.columns.tolist())
+            y_col = st.selectbox("Y-axis", numeric_cols if numeric_cols else dataset.columns.tolist())
+            # For 3D scatter, we need a z-axis
+            if plot_type == "3D Scatter":
+                z_col = st.selectbox("Z-axis", numeric_cols if numeric_cols else dataset.columns.tolist())
+            # Optional color dimension
+            use_color = st.checkbox("Add color dimension")
+            color_col = None
+            if use_color:
+                color_col = st.selectbox("Color by", dataset.columns.tolist())
+            # Create plot
+            if plot_type == "Scatter":
+                fig = px.scatter(
+                    dataset,
+                    x=x_col,
+                    y=y_col,
+                    color=color_col,
+                    title=f"{y_col} vs {x_col}",
+                    template="simple_white"
+                )
+            elif plot_type == "Line":
+                fig = px.line(
+                    dataset.sort_values(x_col),
+                    x=x_col,
+                    y=y_col,
+                    color=color_col,
+                    title=f"{y_col} vs {x_col}",
+                    template="simple_white"
+                )
+            elif plot_type == "Bar":
+                fig = px.bar(
+                    dataset,
+                    x=x_col,
+                    y=y_col,
+                    color=color_col,
+                    title=f"{y_col} by {x_col}",
+                    template="simple_white"
+                )
+            elif plot_type == "3D Scatter":
+                fig = px.scatter_3d(
+                    dataset,
+                    x=x_col,
+                    y=y_col,
+                    z=z_col,
+                    color=color_col,
+                    title=f"3D Scatter: {x_col}, {y_col}, {z_col}",
+                    template="simple_white"
+                )
+            st.plotly_chart(fig, use_container_width=True)
+        elif plot_type in ["Box", "Violin"]:
+            # For box/violin, we need x (categorical) and y (numeric)
+            x_col = st.selectbox("X-axis (categories)", categorical_cols if categorical_cols else dataset.columns.tolist())
+            y_col = st.selectbox("Y-axis (values)", numeric_cols if numeric_cols else dataset.columns.tolist())
+            # Optional color dimension
+            use_color = st.checkbox("Add color dimension")
+            color_col = None
+            if use_color:
+                color_col = st.selectbox("Color by", dataset.columns.tolist())
+            # Create plot
+            if plot_type == "Box":
+                fig = px.box(
+                    dataset,
+                    x=x_col,
+                    y=y_col,
+                    color=color_col,
+                    title=f"Box Plot: {y_col} by {x_col}",
+                    template="simple_white"
+                )
+            else:  # Violin
+                fig = px.violin(
+                    dataset,
+                    x=x_col,
+                    y=y_col,
+                    color=color_col,
+                    title=f"Violin Plot: {y_col} by {x_col}",
+                    template="simple_white"
+                )
+            st.plotly_chart(fig, use_container_width=True)
+        elif plot_type == "Histogram":
+            # For histogram, we need just one column
+            value_col = st.selectbox("Value column", dataset.columns.tolist())
+            # Bins option
+            n_bins = st.slider("Number of bins", 5, 100, 20)
+            # Optional color dimension
+            use_color = st.checkbox("Add color dimension")
+            color_col = None
+            if use_color:
+                color_col = st.selectbox("Color by", dataset.columns.tolist())
+            # Create plot
+            fig = px.histogram(
+                dataset,
+                x=value_col,
+                color=color_col,
+                nbins=n_bins,
+                title=f"Histogram of {value_col}",
+                template="simple_white"
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        elif plot_type == "Pie":
+            # For pie, we need a categorical column
+            cat_col = st.selectbox("Category column", categorical_cols if categorical_cols else dataset.columns.tolist())
+            # Optional value column
+            use_values = st.checkbox("Use custom values")
+            value_col = None
+            if use_values and numeric_cols:
+                value_col = st.selectbox("Value column", numeric_cols)
+            # Limit to top N categories if there are too many
+            top_n = st.slider("Limit to top N categories", 0, 20, 10,
+                help="Set to 0 to show all categories. Recommended to limit to top 10-15 categories for readability.")
+            # Process data for pie chart
+            if top_n > 0:
+                if use_values and value_col:
+                    pie_data = dataset.groupby(cat_col)[value_col].sum().reset_index()
+                    pie_data = pie_data.sort_values(value_col, ascending=False).head(top_n)
+                else:
+                    value_counts = dataset[cat_col].value_counts().reset_index()
+                    value_counts.columns = [cat_col, 'count']
+                    pie_data = value_counts.head(top_n)
+                    value_col = 'count'
+            else:
+                if use_values and value_col:
+                    pie_data = dataset.groupby(cat_col)[value_col].sum().reset_index()
+                else:
+                    value_counts = dataset[cat_col].value_counts().reset_index()
+                    value_counts.columns = [cat_col, 'count']
+                    pie_data = value_counts
+                    value_col = 'count'
+            # Create plot
+            fig = px.pie(
+                pie_data,
+                names=cat_col,
+                values=value_col,
+                title=f"Pie Chart of {cat_col}",
+                template="simple_white"
+            )
+            st.plotly_chart(fig, use_container_width=True)

components/fine_tuning/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Fine-tuning package for code generation models.
+"""

components/fine_tuning/finetune_ui.py ADDED Viewed

	@@ -0,0 +1,529 @@

+"""
+Streamlit UI for fine-tuning code generation models.
+"""
+import streamlit as st
+import pandas as pd
+import numpy as np
+import os
+import time
+from datetime import datetime
+import torch
+import plotly.express as px
+import plotly.graph_objects as go
+from pathlib import Path
+import json
+import uuid
+import threading
+from transformers import TrainingArguments
+from datasets import Dataset
+from components.fine_tuning.model_interface import (
+    load_model_and_tokenizer,
+    preprocess_code_dataset,
+    setup_trainer,
+    generate_code_comment,
+    generate_code_from_comment,
+    save_training_config,
+    load_training_config
+)
+# Initialize training state
+if 'training_run_id' not in st.session_state:
+    st.session_state.training_run_id = None
+if 'training_status' not in st.session_state:
+    st.session_state.training_status = "idle"  # idle, running, completed, failed
+if 'training_progress' not in st.session_state:
+    st.session_state.training_progress = 0.0
+if 'trained_model' not in st.session_state:
+    st.session_state.trained_model = None
+if 'trained_tokenizer' not in st.session_state:
+    st.session_state.trained_tokenizer = None
+if 'training_logs' not in st.session_state:
+    st.session_state.training_logs = []
+if 'fine_tuning_dataset' not in st.session_state:
+    st.session_state.fine_tuning_dataset = None
+# Directory for saving models
+MODELS_DIR = Path("./fine_tuned_models")
+MODELS_DIR.mkdir(exist_ok=True)
+# Set for background training thread
+training_thread = None
+def render_dataset_preparation():
+    """
+    Render the dataset preparation interface.
+    """
+    st.markdown("### Dataset Preparation")
+    # Dataset input options
+    dataset_source = st.radio(
+        "Choose dataset source",
+        ["Upload CSV", "Manual Input", "Use Current Dataset"],
+        help="Select how you want to provide your fine-tuning dataset"
+    )
+    if dataset_source == "Upload CSV":
+        uploaded_file = st.file_uploader(
+            "Upload fine-tuning dataset (CSV)",
+            type=["csv"],
+            help="CSV file with 'input' and 'target' columns"
+        )
+        if uploaded_file is not None:
+            try:
+                df = pd.read_csv(uploaded_file)
+                # Check if required columns exist
+                if "input" not in df.columns or "target" not in df.columns:
+                    st.error("CSV must contain 'input' and 'target' columns.")
+                    return
+                # Preview dataset
+                st.markdown("### Dataset Preview")
+                st.dataframe(df.head(), use_container_width=True)
+                # Dataset statistics
+                st.markdown("### Dataset Statistics")
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("Number of examples", len(df))
+                with col2:
+                    st.metric("Average input length", df["input"].astype(str).str.len().mean().round(1))
+                # Save dataset
+                if st.button("Use this dataset"):
+                    st.session_state.fine_tuning_dataset = df
+                    st.success(f"Dataset with {len(df)} examples loaded successfully!")
+            except Exception as e:
+                st.error(f"Error loading CSV: {str(e)}")
+    elif dataset_source == "Manual Input":
+        st.markdown("""
+        Enter pairs of inputs and targets for fine-tuning. For code-to-comment tasks, the input is code and
+        the target is a comment. For comment-to-code tasks, the input is a comment and the target is code.
+        """)
+        # Container for input fields
+        examples_container = st.container()
+        # Default number of example fields
+        if "num_examples" not in st.session_state:
+            st.session_state.num_examples = 3
+        # Add more examples button
+        if st.button("Add another example"):
+            st.session_state.num_examples += 1
+        # Input fields for examples
+        inputs = []
+        targets = []
+        with examples_container:
+            for i in range(st.session_state.num_examples):
+                st.markdown(f"### Example {i+1}")
+                col1, col2 = st.columns(2)
+                with col1:
+                    input_text = st.text_area(f"Input {i+1}", key=f"input_{i}", height=150)
+                    inputs.append(input_text)
+                with col2:
+                    target_text = st.text_area(f"Target {i+1}", key=f"target_{i}", height=150)
+                    targets.append(target_text)
+        # Create dataset from manual input
+        if st.button("Create Dataset from Examples"):
+            # Filter out empty examples
+            valid_examples = [(inp, tgt) for inp, tgt in zip(inputs, targets) if inp.strip() and tgt.strip()]
+            if valid_examples:
+                df = pd.DataFrame(valid_examples, columns=["input", "target"])
+                st.session_state.fine_tuning_dataset = df
+                # Preview dataset
+                st.markdown("### Dataset Preview")
+                st.dataframe(df, use_container_width=True)
+                st.success(f"Dataset with {len(df)} examples created successfully!")
+            else:
+                st.warning("No valid examples found. Please enter at least one input-target pair.")
+    elif dataset_source == "Use Current Dataset":
+        if st.session_state.dataset is None:
+            st.warning("No dataset is currently loaded. Please upload or select a dataset first.")
+        else:
+            st.markdown("### Current Dataset")
+            st.dataframe(st.session_state.dataset.head(), use_container_width=True)
+            # Column selection
+            col1, col2 = st.columns(2)
+            with col1:
+                input_col = st.selectbox("Select column for inputs", st.session_state.dataset.columns)
+            with col2:
+                target_col = st.selectbox("Select column for targets", st.session_state.dataset.columns)
+            # Create fine-tuning dataset
+            if st.button("Create Fine-Tuning Dataset"):
+                df = st.session_state.dataset[[input_col, target_col]].copy()
+                df.columns = ["input", "target"]
+                # Verify data types and convert to string if necessary
+                df["input"] = df["input"].astype(str)
+                df["target"] = df["target"].astype(str)
+                # Preview
+                st.dataframe(df.head(), use_container_width=True)
+                # Store dataset
+                st.session_state.fine_tuning_dataset = df
+                st.success(f"Fine-tuning dataset with {len(df)} examples created successfully!")
+def render_model_training():
+    """
+    Render the model training interface.
+    """
+    st.markdown("### Model Training")
+    # Check if dataset is available
+    if st.session_state.fine_tuning_dataset is None:
+        st.warning("Please prepare a dataset in the 'Dataset Preparation' tab first.")
+        return
+    # Model selection
+    model_options = {
+        "Salesforce/codet5-small": "CodeT5 Small (60M params)",
+        "Salesforce/codet5-base": "CodeT5 Base (220M params)",
+        "Salesforce/codet5-large": "CodeT5 Large (770M params)",
+        "microsoft/codebert-base": "CodeBERT Base (125M params)",
+        "facebook/bart-base": "BART Base (140M params)"
+    }
+    model_name = st.selectbox(
+        "Select pre-trained model",
+        list(model_options.keys()),
+        format_func=lambda x: model_options[x],
+        help="Select the base model for fine-tuning"
+    )
+    # Task type
+    task_type = st.selectbox(
+        "Select task type",
+        ["Code to Comment", "Comment to Code"],
+        help="Choose the direction of your task"
+    )
+    # Task prefix
+    if task_type == "Code to Comment":
+        task_prefix = "translate code to comment: "
+    else:
+        task_prefix = "translate comment to code: "
+    # Hyperparameters
+    st.markdown("### Training Hyperparameters")
+    col1, col2 = st.columns(2)
+    with col1:
+        learning_rate = st.select_slider(
+            "Learning Rate",
+            options=[1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4],
+            value=5e-5,
+            help="Step size for optimizer updates"
+        )
+        epochs = st.slider(
+            "Epochs",
+            min_value=1,
+            max_value=20,
+            value=3,
+            help="Number of complete passes through the dataset"
+        )
+    with col2:
+        batch_size = st.select_slider(
+            "Batch Size",
+            options=[1, 2, 4, 8, 16, 32],
+            value=8,
+            help="Number of examples processed in each training step"
+        )
+        max_input_length = st.slider(
+            "Max Input Length (tokens)",
+            min_value=64,
+            max_value=512,
+            value=256,
+            help="Maximum length of input sequences"
+        )
+    # Advanced options
+    with st.expander("Advanced Options"):
+        col1, col2 = st.columns(2)
+        with col1:
+            weight_decay = st.select_slider(
+                "Weight Decay",
+                options=[0.0, 0.01, 0.05, 0.1],
+                value=0.01,
+                help="L2 regularization"
+            )
+            warmup_steps = st.slider(
+                "Warmup Steps",
+                min_value=0,
+                max_value=1000,
+                value=100,
+                help="Steps for learning rate warmup"
+            )
+        with col2:
+            max_target_length = st.slider(
+                "Max Target Length (tokens)",
+                min_value=64,
+                max_value=512,
+                value=256,
+                help="Maximum length of target sequences"
+            )
+            gradient_accumulation = st.slider(
+                "Gradient Accumulation Steps",
+                min_value=1,
+                max_value=16,
+                value=1,
+                help="Number of steps to accumulate gradients"
+            )
+    # Model output configuration
+    st.markdown("### Model Output Configuration")
+    model_name_custom = st.text_input(
+        "Custom model name",
+        value=f"{model_name.split('/')[-1]}-finetuned-{task_type.lower().replace(' ', '-')}",
+        help="Name for your fine-tuned model"
+    )
+    # Training controls
+    st.markdown("### Training Controls")
+    # Check if training is in progress
+    if st.session_state.training_status == "running":
+        # Display progress
+        st.progress(st.session_state.training_progress)
+        # Show logs
+        if st.session_state.training_logs:
+            st.markdown("### Training Logs")
+            log_text = "\n".join(st.session_state.training_logs[-10:])  # Show last 10 logs
+            st.text_area("Latest logs", log_text, height=200, disabled=True)
+        # Stop button
+        if st.button("Stop Training"):
+            # Logic to stop training thread
+            st.session_state.training_status = "stopping"
+            st.warning("Stopping training after current epoch completes...")
+    elif st.session_state.training_status == "completed":
+        st.success(f"Training completed! Model saved as: {model_name_custom}")
+        # Show metrics if available
+        if "training_metrics" in st.session_state:
+            st.markdown("### Training Metrics")
+            metrics_df = pd.DataFrame(st.session_state.training_metrics)
+            st.line_chart(metrics_df)
+        # Reset button
+        if st.button("Start New Training"):
+            st.session_state.training_status = "idle"
+            st.session_state.training_progress = 0.0
+            st.session_state.training_logs = []
+            st.experimental_rerun()
+    else:  # idle or failed
+        # If previously failed, show error
+        if st.session_state.training_status == "failed":
+            st.error("Previous training failed. See logs for details.")
+            if st.session_state.training_logs:
+                st.text_area("Error logs", "\n".join(st.session_state.training_logs[-5:]), height=100, disabled=True)
+        # Start training button
+        if st.button("Start Training"):
+            # Validate dataset
+            if len(st.session_state.fine_tuning_dataset) < 5:
+                st.warning("Dataset is very small. Consider adding more examples for better results.")
+            # Set up training configuration
+            training_config = {
+                "model_name": model_name,
+                "task_type": task_type,
+                "task_prefix": task_prefix,
+                "learning_rate": learning_rate,
+                "epochs": epochs,
+                "batch_size": batch_size,
+                "max_input_length": max_input_length,
+                "max_target_length": max_target_length,
+                "weight_decay": weight_decay,
+                "warmup_steps": warmup_steps,
+                "gradient_accumulation": gradient_accumulation,
+                "output_model_name": model_name_custom,
+                "dataset_size": len(st.session_state.fine_tuning_dataset)
+            }
+            # Update session state
+            st.session_state.training_status = "running"
+            st.session_state.training_progress = 0.0
+            st.session_state.training_logs = ["Training initialized..."]
+            st.session_state.training_run_id = str(uuid.uuid4())
+            # TODO: Start actual training process using transformers
+            st.info("Training would start here with the Hugging Face transformers library")
+            # For now, just simulate training progress
+            st.session_state.training_progress = 0.1
+            st.session_state.training_logs.append("Loaded model and tokenizer")
+            st.session_state.training_logs.append("Preprocessing dataset...")
+            # Rerun to update UI with progress
+            st.experimental_rerun()
+def render_model_testing():
+    """
+    Render the model testing interface.
+    """
+    st.markdown("### Test & Use Model")
+    # Check if a model is trained/available
+    if st.session_state.trained_model is None and st.session_state.training_status != "completed":
+        # Look for saved models
+        saved_models = list(MODELS_DIR.glob("*/"))
+        if not saved_models:
+            st.warning("No trained models available. Please train a model first.")
+            return
+        # Let user select a saved model
+        model_options = [model.name for model in saved_models]
+        selected_model = st.selectbox("Select a saved model", model_options)
+        if st.button("Load Selected Model"):
+            st.info(f"Loading model {selected_model}...")
+            # TODO: Load model logic
+            st.session_state.trained_model = "loaded"  # Placeholder
+            st.session_state.trained_tokenizer = "loaded"  # Placeholder
+            st.success("Model loaded successfully!")
+    else:
+        # Model is available for testing
+        model_type = "Code to Comment" if "code-to-comment" in st.session_state.get("model_name", "") else "Comment to Code"
+        st.markdown(f"### Testing {model_type} Generation")
+        if model_type == "Code to Comment":
+            input_text = st.text_area(
+                "Enter code snippet",
+                height=200,
+                help="Enter a code snippet to generate a comment"
+            )
+            if st.button("Generate Comment"):
+                if input_text:
+                    with st.spinner("Generating comment..."):
+                        # TODO: Replace with actual model inference
+                        result = f"/* This code {input_text.split()[0:3]} ... */"
+                        st.markdown("### Generated Comment")
+                        st.code(result)
+                else:
+                    st.warning("Please enter a code snippet.")
+        else:  # Comment to Code
+            input_text = st.text_area(
+                "Enter comment/description",
+                height=150,
+                help="Enter a description to generate code"
+            )
+            language = st.selectbox(
+                "Programming language",
+                ["Python", "JavaScript", "Java", "C++", "Go"]
+            )
+            if st.button("Generate Code"):
+                if input_text:
+                    with st.spinner("Generating code..."):
+                        # TODO: Replace with actual model inference
+                        result = f"def example_function():\n    # {input_text}\n    pass"
+                        st.markdown("### Generated Code")
+                        st.code(result, language=language.lower())
+                else:
+                    st.warning("Please enter a comment or description.")
+        # Batch testing
+        with st.expander("Batch Testing"):
+            st.markdown("Upload a CSV file with test cases to evaluate your model.")
+            test_file = st.file_uploader(
+                "Upload test cases (CSV)",
+                type=["csv"],
+                help="CSV file with 'input' and 'expected' columns"
+            )
+            if test_file is not None:
+                try:
+                    test_df = pd.read_csv(test_file)
+                    st.dataframe(test_df.head(), use_container_width=True)
+                    if st.button("Run Batch Test"):
+                        with st.spinner("Running tests..."):
+                            # TODO: Actual batch inference
+                            st.success("Batch testing completed!")
+                            # Dummy results
+                            results = pd.DataFrame({
+                                "input": test_df["input"],
+                                "expected": test_df.get("expected", [""] * len(test_df)),
+                                "generated": ["Sample output " + str(i) for i in range(len(test_df))],
+                                "match_score": np.random.uniform(0.5, 1.0, len(test_df))
+                            })
+                            st.dataframe(results, use_container_width=True)
+                            # Metrics
+                            st.markdown("### Evaluation Metrics")
+                            col1, col2 = st.columns(2)
+                            with col1:
+                                st.metric("Average Match Score", f"{results['match_score'].mean():.2f}")
+                            with col2:
+                                st.metric("Tests Passed", f"{sum(results['match_score'] > 0.8)}/{len(results)}")
+                except Exception as e:
+                    st.error(f"Error loading test file: {str(e)}")
+def render_finetune_ui():
+    """
+    Render the fine-tuning UI for code generation models.
+    """
+    st.markdown("<h2>Fine-Tune Code Generation Model</h2>", unsafe_allow_html=True)
+    # Overview and instructions
+    with st.expander("About Fine-Tuning", expanded=False):
+        st.markdown("""
+        ## Fine-Tuning a Code Generation Model
+        This interface allows you to fine-tune pre-trained code generation models from Hugging Face
+        on your custom dataset to adapt them to your specific coding style or task.
+        ### How to use:
+        1. **Prepare your dataset** - Upload a CSV file with 'input' and 'target' columns:
+           - For code-to-comment: 'input' = code snippets, 'target' = corresponding comments
+           - For comment-to-code: 'input' = comments, 'target' = corresponding code snippets
+        2. **Configure training** - Set hyperparameters like learning rate, batch size, and epochs
+        3. **Start fine-tuning** - Launch the training process and monitor progress
+        4. **Test your model** - Once training is complete, test your model on new inputs
+        ### Tips for better results:
+        - Use a consistent format for your code snippets and comments
+        - Start with a small dataset (50-100 examples) to verify the process
+        - Try different hyperparameters to find the best configuration
+        """)
+    # Main UI with tabs
+    tab1, tab2, tab3 = st.tabs(["Dataset Preparation", "Model Training", "Test & Use Model"])
+    with tab1:
+        render_dataset_preparation()
+    with tab2:
+        render_model_training()
+    with tab3:
+        render_model_testing()

components/fine_tuning/model_interface.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Hugging Face model interface for code generation fine-tuning.
+"""
+import streamlit as st
+import pandas as pd
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    Trainer,
+    TrainingArguments,
+    DataCollatorForSeq2Seq,
+)
+from datasets import Dataset
+import numpy as np
+import time
+import os
+from pathlib import Path
+import uuid
+import json
+@st.cache_resource(show_spinner=False)
+def load_model_and_tokenizer(model_name):
+    """
+    Load a pre-trained model and tokenizer from Hugging Face.
+    Args:
+        model_name: Name of the model on Hugging Face (e.g., 'Salesforce/codet5-base')
+    Returns:
+        Tuple of (tokenizer, model)
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    return tokenizer, model
+def preprocess_code_dataset(dataset_df, tokenizer, max_input_length=256, max_target_length=256, task_prefix=""):
+    """
+    Preprocess the code dataset for fine-tuning.
+    Args:
+        dataset_df: Pandas DataFrame with 'input' and 'target' columns
+        tokenizer: HuggingFace tokenizer
+        max_input_length: Maximum length for input sequences
+        max_target_length: Maximum length for target sequences
+        task_prefix: Prefix to add to inputs (e.g., "translate code to comment: ")
+    Returns:
+        HuggingFace Dataset ready for training
+    """
+    def preprocess_function(examples):
+        inputs = [task_prefix + text for text in examples["input"]]
+        targets = examples["target"]
+        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
+        # Set up the tokenizer for targets
+        with tokenizer.as_target_tokenizer():
+            labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
+        model_inputs["labels"] = labels["input_ids"]
+        return model_inputs
+    # Convert DataFrame to HuggingFace Dataset
+    hf_dataset = Dataset.from_pandas(dataset_df)
+    # Split dataset into train and validation
+    splits = hf_dataset.train_test_split(test_size=0.1)
+    train_dataset = splits["train"]
+    eval_dataset = splits["test"]
+    # Apply preprocessing
+    train_dataset = train_dataset.map(
+        preprocess_function,
+        batched=True,
+        remove_columns=["input", "target"]
+    )
+    eval_dataset = eval_dataset.map(
+        preprocess_function,
+        batched=True,
+        remove_columns=["input", "target"]
+    )
+    return train_dataset, eval_dataset
+def setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir, training_args):
+    """
+    Set up the Trainer for fine-tuning.
+    Args:
+        model: HuggingFace model
+        tokenizer: HuggingFace tokenizer
+        train_dataset: Preprocessed training dataset
+        eval_dataset: Preprocessed evaluation dataset
+        output_dir: Directory to save model and checkpoints
+        training_args: Dictionary of training arguments
+    Returns:
+        HuggingFace Trainer
+    """
+    # Define training arguments
+    args = TrainingArguments(
+        output_dir=output_dir,
+        per_device_train_batch_size=training_args.get("batch_size", 8),
+        per_device_eval_batch_size=training_args.get("batch_size", 8),
+        learning_rate=training_args.get("learning_rate", 5e-5),
+        num_train_epochs=training_args.get("epochs", 3),
+        weight_decay=training_args.get("weight_decay", 0.01),
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        push_to_hub=False,
+        gradient_accumulation_steps=training_args.get("gradient_accumulation", 1),
+        warmup_steps=training_args.get("warmup_steps", 100),
+        logging_dir=os.path.join(output_dir, "logs"),
+        logging_steps=10,
+    )
+    # Data collator
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer,
+        model=model,
+        label_pad_token_id=tokenizer.pad_token_id,
+        pad_to_multiple_of=8
+    )
+    # Initialize Trainer
+    trainer = Trainer(
+        model=model,
+        args=args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+    return trainer
+def generate_code_comment(model, tokenizer, code, max_length=100, task_prefix="translate code to comment: "):
+    """
+    Generate a comment for a given code snippet.
+    Args:
+        model: Fine-tuned model
+        tokenizer: Tokenizer
+        code: Input code snippet
+        max_length: Maximum length of the generated comment
+        task_prefix: Prefix to add to the input
+    Returns:
+        Generated comment as string
+    """
+    inputs = tokenizer(task_prefix + code, return_tensors="pt", padding=True, truncation=True)
+    # Move inputs to the same device as model
+    device = model.device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Generate
+    output_ids = model.generate(
+        inputs["input_ids"],
+        max_length=max_length,
+        num_beams=4,
+        early_stopping=True
+    )
+    comment = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return comment
+def generate_code_from_comment(model, tokenizer, comment, max_length=200, task_prefix="translate comment to code: "):
+    """
+    Generate code from a given comment/description.
+    Args:
+        model: Fine-tuned model
+        tokenizer: Tokenizer
+        comment: Input comment or description
+        max_length: Maximum length of the generated code
+        task_prefix: Prefix to add to the input
+    Returns:
+        Generated code as string
+    """
+    inputs = tokenizer(task_prefix + comment, return_tensors="pt", padding=True, truncation=True)
+    # Move inputs to the same device as model
+    device = model.device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Generate
+    output_ids = model.generate(
+        inputs["input_ids"],
+        max_length=max_length,
+        num_beams=4,
+        early_stopping=True
+    )
+    code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return code
+def save_training_config(output_dir, config):
+    """
+    Save training configuration to a JSON file.
+    Args:
+        output_dir: Directory to save the configuration
+        config: Dictionary with training configuration
+    """
+    config_path = os.path.join(output_dir, "training_config.json")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+def load_training_config(model_dir):
+    """
+    Load training configuration from a JSON file.
+    Args:
+        model_dir: Directory with the saved model
+    Returns:
+        Dictionary with training configuration
+    """
+    config_path = os.path.join(model_dir, "training_config.json")
+    if os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            return json.load(f)
+    return {}

generated-icon.png ADDED Viewed

Git LFS Details

SHA256: 4d5eb5d7587b63984ad9a2ba3d6a09dda0feba809e7cdb0bc047f9c58314a04e
Pointer size: 131 Bytes
Size of remote file: 140 kB

huggingface-spacefile ADDED Viewed

	@@ -0,0 +1,8 @@

+title: ML Dataset & Code Generation Manager
+emoji: 🤗
+colorFrom: indigo
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.42.0
+app_file: app.py
+pinned: false

main.py ADDED Viewed

	@@ -0,0 +1,546 @@

+import streamlit as st
+import os
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import json
+from pathlib import Path
+# Make sure necessary directories exist
+os.makedirs('assets', exist_ok=True)
+os.makedirs('database/data', exist_ok=True)
+os.makedirs('fine_tuned_models', exist_ok=True)
+# Page configuration
+st.set_page_config(
+    page_title="ML Dataset & Code Generation Manager",
+    page_icon="🤗",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+def load_css():
+    """Load custom CSS styles"""
+    css_dir = Path("assets")
+    css_path = css_dir / "custom.css"
+    if not css_path.exists():
+        # Create assets directory if it doesn't exist
+        css_dir.mkdir(exist_ok=True)
+        # Create a basic CSS file if it doesn't exist
+        with open(css_path, "w") as f:
+            f.write("""
+            /* Custom styles for ML Dataset & Code Generation Manager */
+            @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Space+Grotesk:wght@500;700&display=swap');
+            h1, h2, h3, h4, h5, h6 {
+                font-family: 'Space Grotesk', sans-serif;
+                font-weight: 700;
+                color: #1A1C1F;
+            }
+            body {
+                font-family: 'Inter', sans-serif;
+                color: #1A1C1F;
+                background-color: #F8F9FA;
+            }
+            .stButton button {
+                background-color: #2563EB;
+                color: white;
+                border-radius: 4px;
+                border: none;
+                padding: 0.5rem 1rem;
+                font-weight: 600;
+            }
+            .stButton button:hover {
+                background-color: #1D4ED8;
+            }
+            /* Card styling */
+            .card {
+                background-color: white;
+                border-radius: 8px;
+                padding: 1.5rem;
+                box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+                margin-bottom: 1rem;
+            }
+            /* Accent colors */
+            .accent-primary {
+                color: #2563EB;
+            }
+            .accent-secondary {
+                color: #84919A;
+            }
+            .accent-success {
+                color: #10B981;
+            }
+            .accent-warning {
+                color: #F59E0B;
+            }
+            .accent-danger {
+                color: #EF4444;
+            }
+            """)
+    # Load custom CSS
+    with open(css_path, "r") as f:
+        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+def render_finetune_ui():
+    """
+    Renders the fine-tuning UI for code generation models.
+    Imports components.fine_tuning.finetune_ui lazily and delegates to its
+    render_finetune_ui. On ImportError it shows the error, creates the
+    components/fine_tuning package with a placeholder __init__.py and
+    finetune_ui.py (each written only if the file does not already exist),
+    and then retries the import once.
+    """
+    try:
+        from components.fine_tuning.finetune_ui import render_finetune_ui as ft_ui
+        ft_ui()
+    except ImportError as e:
+        st.error(f"Could not load fine-tuning UI: {e}")
+        # Create default fine-tuning UI component if not exists
+        # NOTE(review): when both files already exist, nothing is written and the
+        # retry below repeats the same import — confirm that is intended.
+        os.makedirs("components/fine_tuning", exist_ok=True)
+        if not os.path.exists("components/fine_tuning/__init__.py"):
+            with open("components/fine_tuning/__init__.py", "w") as f:
+                f.write('"""\nFine-tuning package for code generation models.\n"""\n')
+        if not os.path.exists("components/fine_tuning/finetune_ui.py"):
+            # The literal below is the full source of the placeholder module
+            with open("components/fine_tuning/finetune_ui.py", "w") as f:
+                f.write('''"""
+Streamlit UI for fine-tuning code generation models.
+"""
+import streamlit as st
+import pandas as pd
+import os
+def render_dataset_preparation():
+    """
+    Render the dataset preparation interface.
+    """
+    st.subheader("Dataset Preparation")
+    st.write("Prepare your dataset for fine-tuning code generation models.")
+    # Dataset upload
+    uploaded_file = st.file_uploader("Upload your dataset", type=["csv", "json"])
+    if uploaded_file is not None:
+        try:
+            if uploaded_file.name.endswith('.csv'):
+                df = pd.read_csv(uploaded_file)
+            else:
+                df = pd.read_json(uploaded_file)
+            st.write("Dataset Preview:")
+            st.dataframe(df.head())
+            # Example of data columns mapping
+            st.subheader("Column Mapping")
+            input_col = st.selectbox("Select input column (e.g., code)", df.columns)
+            target_col = st.selectbox("Select target column (e.g., comment)", df.columns)
+            # Sample transformation
+            if st.button("Apply Transformation"):
+                if input_col and target_col:
+                    # Example transformation: simple trim/clean
+                    df[input_col] = df[input_col].astype(str).str.strip()
+                    df[target_col] = df[target_col].astype(str).str.strip()
+                    st.write("Transformed Dataset:")
+                    st.dataframe(df.head())
+                    # Option to save processed dataset
+                    if st.button("Save Processed Dataset"):
+                        processed_path = os.path.join("datasets", "processed_dataset.csv")
+                        os.makedirs("datasets", exist_ok=True)
+                        df.to_csv(processed_path, index=False)
+                        st.success(f"Dataset saved to {processed_path}")
+        except Exception as e:
+            st.error(f"Error processing dataset: {e}")
+def render_model_training():
+    """
+    Render the model training interface.
+    """
+    st.subheader("Model Training")
+    st.write("Configure and start training your model.")
+    # Model selection
+    model_options = [
+        "Salesforce/codet5-small",
+        "Salesforce/codet5-base",
+        "microsoft/codebert-base",
+        "microsoft/graphcodebert-base"
+    ]
+    selected_model = st.selectbox("Select base model", model_options)
+    # Training parameters
+    col1, col2 = st.columns(2)
+    with col1:
+        batch_size = st.number_input("Batch size", min_value=1, max_value=64, value=8)
+        epochs = st.number_input("Number of epochs", min_value=1, max_value=100, value=3)
+        learning_rate = st.number_input("Learning rate", min_value=0.00001, max_value=0.1, value=0.0001, format="%.5f")
+    with col2:
+        max_input_length = st.number_input("Max input length", min_value=32, max_value=512, value=128)
+        max_target_length = st.number_input("Max target length", min_value=32, max_value=512, value=128)
+        task_type = st.selectbox("Task type", ["Code to Comment", "Comment to Code"])
+    # Training button (placeholder)
+    if st.button("Start Training"):
+        st.info("Training would start here. This is a placeholder.")
+        # In a real implementation, this would call the training function
+        # and display a progress bar or redirect to a training monitoring page
+def render_model_testing():
+    """
+    Render the model testing interface.
+    """
+    st.subheader("Model Testing")
+    st.write("Test your fine-tuned model with custom inputs.")
+    # Model selection
+    st.selectbox("Select fine-tuned model", ["No models available yet"])
+    # Test input
+    if st.selectbox("Task type", ["Code to Comment", "Comment to Code"]) == "Code to Comment":
+        test_input = st.text_area("Enter code to generate a comment",
+                                  value="def fibonacci(n):\\n    if n <= 1:\\n        return n\\n    else:\\n        return fibonacci(n-1) + fibonacci(n-2)")
+        placeholder = "# This function implements the Fibonacci sequence recursively..."
+    else:
+        test_input = st.text_area("Enter comment to generate code",
+                                 value="# A function that calculates the factorial of a number recursively")
+        placeholder = "def factorial(n):\\n    if n == 0:\\n        return 1\\n    else:\\n        return n * factorial(n-1)"
+    # Generate button (placeholder)
+    if st.button("Generate"):
+        st.code(placeholder, language="python")
+        # In a real implementation, this would call the model inference function
+def render_finetune_ui():
+    """
+    Render the fine-tuning UI for code generation models.
+    """
+    st.title("Fine-Tune Code Generation Models")
+    tabs = st.tabs(["Dataset Preparation", "Model Training", "Model Testing"])
+    with tabs[0]:
+        render_dataset_preparation()
+    with tabs[1]:
+        render_model_training()
+    with tabs[2]:
+        render_model_testing()
+''')
+        # Try again after creating the files
+        try:
+            from components.fine_tuning.finetune_ui import render_finetune_ui as ft_ui
+            ft_ui()
+        except ImportError as e:
+            # Second failure: report it and ask for a restart rather than raising
+            st.error(f"Still could not load fine-tuning UI after creating files: {e}")
+            st.info("Please restart the app to initialize the components.")
+def render_code_quality_ui():
+    """
+    Renders the code quality tools UI.
+    """
+    try:
+        from components.code_quality import render_code_quality_tools
+        render_code_quality_tools()
+    except ImportError:
+        st.error("Code quality tools not found. Implementing basic version.")
+        st.title("Code Quality Tools")
+        st.write("This section will provide tools for code linting, formatting, and testing.")
+        # Tabs for different code quality tools
+        tabs = st.tabs(["Linting", "Formatting", "Type Checking", "Testing"])
+        with tabs[0]:
+            st.subheader("Code Linting")
+            st.write("Tools for checking code quality and style.")
+            st.code("# Coming soon: PyLint and Flake8 integration")
+        with tabs[1]:
+            st.subheader("Code Formatting")
+            st.write("Tools for formatting code according to style guides.")
+            st.code("# Coming soon: Black and isort integration")
+        with tabs[2]:
+            st.subheader("Type Checking")
+            st.write("Tools for checking type annotations.")
+            st.code("# Coming soon: MyPy integration")
+        with tabs[3]:
+            st.subheader("Testing")
+            st.write("Tools for running tests and checking code coverage.")
+            st.code("# Coming soon: PyTest integration")
+def render_dataset_management_ui():
+    """
+    Renders the dataset management UI.
+    """
+    st.title("Dataset Management")
+    # Tabs for different dataset operations
+    tabs = st.tabs(["Upload", "Preview", "Statistics", "Visualization", "Validation", "Version Control"])
+    with tabs[0]:
+        try:
+            from components.dataset_uploader import render_dataset_uploader
+            render_dataset_uploader()
+        except ImportError:
+            st.subheader("Dataset Upload")
+            st.write("Upload your datasets in CSV or JSON format.")
+            uploaded_file = st.file_uploader("Choose a file", type=["csv", "json"])
+            if uploaded_file is not None:
+                try:
+                    if uploaded_file.name.endswith('.csv'):
+                        df = pd.read_csv(uploaded_file)
+                        dataset_type = "csv"
+                    else:
+                        df = pd.read_json(uploaded_file)
+                        dataset_type = "json"
+                    st.session_state["dataset"] = df
+                    st.session_state["dataset_type"] = dataset_type
+                    st.success(f"Successfully loaded {dataset_type.upper()} file with {df.shape[0]} rows and {df.shape[1]} columns.")
+                    st.dataframe(df.head())
+                except Exception as e:
+                    st.error(f"Error: {e}")
+    with tabs[1]:
+        if "dataset" in st.session_state:
+            try:
+                from components.dataset_preview import render_dataset_preview
+                render_dataset_preview(st.session_state["dataset"], st.session_state["dataset_type"])
+            except ImportError:
+                st.subheader("Dataset Preview")
+                st.dataframe(st.session_state["dataset"].head(10))
+        else:
+            st.info("Please upload a dataset first.")
+    with tabs[2]:
+        if "dataset" in st.session_state:
+            try:
+                from components.dataset_statistics import render_dataset_statistics
+                render_dataset_statistics(st.session_state["dataset"], st.session_state["dataset_type"])
+            except ImportError:
+                st.subheader("Dataset Statistics")
+                st.write("Basic statistics:")
+                st.write(st.session_state["dataset"].describe())
+                # Missing values
+                missing_data = st.session_state["dataset"].isnull().sum()
+                st.write("Missing values per column:")
+                st.write(missing_data[missing_data > 0])
+        else:
+            st.info("Please upload a dataset first.")
+    with tabs[3]:
+        if "dataset" in st.session_state:
+            try:
+                from components.dataset_visualization import render_dataset_visualization
+                render_dataset_visualization(st.session_state["dataset"], st.session_state["dataset_type"])
+            except ImportError:
+                st.subheader("Dataset Visualization")
+                # Only show for numerical columns
+                numeric_cols = st.session_state["dataset"].select_dtypes(include=[np.number]).columns.tolist()
+                if len(numeric_cols) > 0:
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        x_axis = st.selectbox("X-axis", numeric_cols)
+                    with col2:
+                        y_axis = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
+                    fig = px.scatter(st.session_state["dataset"], x=x_axis, y=y_axis)
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.write("No numerical columns available for visualization.")
+        else:
+            st.info("Please upload a dataset first.")
+    with tabs[4]:
+        if "dataset" in st.session_state:
+            try:
+                from components.dataset_validation import render_dataset_validation
+                render_dataset_validation(st.session_state["dataset"], st.session_state["dataset_type"])
+            except ImportError:
+                st.subheader("Dataset Validation")
+                # Simple validation checks
+                st.write("Dataset Shape:", st.session_state["dataset"].shape)
+                st.write("Duplicate Rows:", st.session_state["dataset"].duplicated().sum())
+                # Missing values percentage
+                missing_percent = (st.session_state["dataset"].isnull().sum() / len(st.session_state["dataset"])) * 100
+                st.write("Missing Values Percentage:")
+                st.write(missing_percent[missing_percent > 0])
+        else:
+            st.info("Please upload a dataset first.")
+    with tabs[5]:
+        if "dataset" in st.session_state:
+            try:
+                from components.dataset_version_control import render_version_control_ui, render_save_version_ui, render_version_visualization
+                # If we have a dataset ID in session state, use it, otherwise prompt to save first
+                if "dataset_id" in st.session_state:
+                    dataset_id = st.session_state["dataset_id"]
+                    # Show dataset version control UI
+                    render_version_control_ui(dataset_id, st.session_state.get("dataset"))
+                    # Show save version UI
+                    st.divider()
+                    if st.session_state.get("dataset") is not None:
+                        new_version = render_save_version_ui(dataset_id, st.session_state["dataset"])
+                        if new_version:
+                            st.success(f"Created new version: {new_version.version_id}")
+                    # Show version visualization
+                    st.divider()
+                    render_version_visualization(dataset_id)
+                else:
+                    # No dataset ID yet, so prompt to save the dataset first
+                    st.info("To use version control, first save this dataset to the database.")
+                    dataset_name = st.text_input("Dataset Name", value="My Dataset")
+                    dataset_description = st.text_area("Dataset Description", value="Dataset uploaded for analysis")
+                    if st.button("Save Dataset to Database"):
+                        # Import database operations
+                        from database.operations import DatasetOperations, DatasetVersionOperations
+                        # Store dataset in database
+                        dataset = DatasetOperations.store_dataframe_info(
+                            df=st.session_state["dataset"],
+                            name=dataset_name,
+                            description=dataset_description,
+                            source="local_upload"
+                        )
+                        # Store as initial version
+                        initial_version = DatasetVersionOperations.create_version_from_dataframe(
+                            dataset_id=dataset.id,
+                            df=st.session_state["dataset"],
+                            description="Initial version"
+                        )
+                        # Store dataset ID in session state
+                        st.session_state["dataset_id"] = dataset.id
+                        st.success(f"Dataset saved to database with ID: {dataset.id}")
+                        st.success(f"Initial version created: {initial_version.version_id}")
+                        # Rerun to show version control UI
+                        st.experimental_rerun()
+            except ImportError as e:
+                st.subheader("Dataset Version Control")
+                st.error(f"Could not load version control components: {e}")
+                st.info("Please make sure all required components are installed.")
+        else:
+            st.info("Please upload a dataset first.")
+def main():
+    """
+    Main function to run the application.
+    Loads the custom CSS, builds the sidebar navigation and renders the page
+    selected there. The "Home" page is rendered inline (four static HTML
+    cards in two columns plus a "Get Started" section); the other three
+    pages delegate to render_dataset_management_ui(), render_finetune_ui()
+    and render_code_quality_ui().
+    Returns:
+        None
+    """
+    # Load custom CSS
+    load_css()
+    # Sidebar for navigation
+    st.sidebar.title("ML Dataset & Code Gen Manager")
+    # Navigation
+    # The radio labels below are compared verbatim in the if/elif chain, so
+    # any rename must be applied in both places.
+    page = st.sidebar.radio("Navigation", ["Home", "Dataset Management", "Fine-Tuning", "Code Quality Tools"])
+    # Display selected page
+    if page == "Home":
+        st.title("ML Dataset & Code Generation Manager")
+        st.write("Welcome to the ML Dataset & Code Generation Manager. This platform helps you manage ML datasets and fine-tune code generation models.")
+        # Main features in cards
+        # Each card is a static HTML literal, so unsafe_allow_html=True does not
+        # expose user input here. NOTE(review): the "card" class is presumably
+        # styled by the CSS loaded above - confirm in assets/custom.css.
+        col1, col2 = st.columns(2)
+        # Left column: Dataset Management and Code Quality Tools cards
+        with col1:
+            st.markdown("""
+            <div class="card">
+                <h3>Dataset Management</h3>
+                <p>Upload, analyze, visualize, and validate your ML datasets.</p>
+                <ul>
+                    <li>Support for CSV and JSON formats</li>
+                    <li>Statistical analysis and visualization</li>
+                    <li>Data validation and quality checks</li>
+                    <li>Hugging Face Hub integration</li>
+                </ul>
+            </div>
+            """, unsafe_allow_html=True)
+            st.markdown("""
+            <div class="card">
+                <h3>Code Quality Tools</h3>
+                <p>Tools for ensuring high-quality code.</p>
+                <ul>
+                    <li>Code linting with PyLint</li>
+                    <li>Code formatting with Black and isort</li>
+                    <li>Type checking with MyPy</li>
+                    <li>Testing with PyTest</li>
+                </ul>
+            </div>
+            """, unsafe_allow_html=True)
+        # Right column: Fine-Tuning and Hugging Face Integration cards
+        with col2:
+            st.markdown("""
+            <div class="card">
+                <h3>Fine-Tuning</h3>
+                <p>Fine-tune code generation models on your custom datasets.</p>
+                <ul>
+                    <li>Support for CodeT5, CodeBERT models</li>
+                    <li>Code-to-comment and comment-to-code tasks</li>
+                    <li>Custom dataset preparation</li>
+                    <li>Model testing and evaluation</li>
+                </ul>
+            </div>
+            """, unsafe_allow_html=True)
+            st.markdown("""
+            <div class="card">
+                <h3>Hugging Face Integration</h3>
+                <p>Seamless integration with Hugging Face Hub.</p>
+                <ul>
+                    <li>Search and load models and datasets</li>
+                    <li>Deploy fine-tuned models to Hugging Face Spaces</li>
+                    <li>Share and collaborate on models and datasets</li>
+                </ul>
+            </div>
+            """, unsafe_allow_html=True)
+        # Get started section
+        st.subheader("Get Started")
+        st.write("To get started, navigate to the Dataset Management page to upload your data, or explore the Fine-Tuning page to train code generation models.")
+    # The remaining pages are rendered by dedicated UI functions
+    elif page == "Dataset Management":
+        render_dataset_management_ui()
+    elif page == "Fine-Tuning":
+        render_finetune_ui()
+    elif page == "Code Quality Tools":
+        render_code_quality_ui()
+# Entry point: render the app only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,33 @@

+[project]
+name = "repl-nix-workspace"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.11"
+dependencies = [
+    "black>=25.1.0",
+    "datasets>=3.3.2",
+    "huggingface-hub>=0.29.1",
+    "isort>=6.0.1",
+    "matplotlib>=3.10.1",
+    "mypy>=1.15.0",
+    "numpy>=2.2.3",
+    "pandas>=2.2.3",
+    "plotly>=6.0.0",
+    "pyarrow>=19.0.1",
+    "pylint>=3.3.4",
+    "pytest>=8.3.4",
+    "scikit-learn>=1.6.1",
+    "sqlalchemy>=2.0.38",
+    "streamlit>=1.42.2",
+    "torch>=2.6.0",
+    "transformers>=4.49.0",
+]
+[[tool.uv.index]]
+explicit = true
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+[tool.uv.sources]
+torch = [{ index = "pytorch-cpu", marker = "platform_system == 'Linux'" }]
+torchvision = [{ index = "pytorch-cpu", marker = "platform_system == 'Linux'" }]

replit.nix ADDED Viewed

	@@ -0,0 +1,18 @@

+{pkgs}: {
+  deps = [
+    pkgs.arrow-cpp
+    pkgs.tk
+    pkgs.tcl
+    pkgs.qhull
+    pkgs.gtk3
+    pkgs.gobject-introspection
+    pkgs.ghostscript
+    pkgs.freetype
+    pkgs.ffmpeg-full
+    pkgs.cairo
+    pkgs.glibcLocales
+    pkgs.xsimd
+    pkgs.pkg-config
+    pkgs.libxcrypt
+  ];
+}

test_app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""
+Simple test file for the ML Dataset & Code Generation Manager application.
+This script checks basic aspects of the application structure and setup.
+"""
+import os
+import sys
+import pandas as pd
+import numpy as np
+from pathlib import Path
+def test_directory_structure():
+    """Test if the required directories exist"""
+    # Ensure necessary directories exist
+    os.makedirs('database/data', exist_ok=True)
+    os.makedirs('assets', exist_ok=True)
+    os.makedirs('fine_tuned_models', exist_ok=True)
+    # Check if directories exist
+    assert Path("database").exists() and Path("database").is_dir(), "Database directory not found"
+    assert Path("assets").exists() and Path("assets").is_dir(), "Assets directory not found"
+    assert Path("fine_tuned_models").exists() and Path("fine_tuned_models").is_dir(), "Fine-tuned models directory not found"
+    print("✅ Directory structure test passed")
+def test_css_file():
+    """Test if the CSS file exists"""
+    css_file = Path("assets/custom.css")
+    assert css_file.exists() and css_file.is_file(), "CSS file not found in assets directory"
+    print("✅ CSS file test passed")
+def test_huggingface_config():
+    """Test if Hugging Face configuration file exists"""
+    config_file = Path("huggingface-spacefile")
+    assert config_file.exists() and config_file.is_file(), "Hugging Face configuration file not found"
+    print("✅ Hugging Face configuration test passed")
+def test_streamlit_config():
+    """Test if Streamlit configuration exists"""
+    config_dir = Path(".streamlit")
+    config_file = config_dir / "config.toml"
+    assert config_dir.exists() and config_dir.is_dir(), ".streamlit directory not found"
+    assert config_file.exists() and config_file.is_file(), "config.toml file not found in .streamlit directory"
+    print("✅ Streamlit configuration test passed")
+def test_sample_dataframe():
+    """Test creation of sample dataframes"""
+    # Create a sample dataframe
+    df = pd.DataFrame({
+        "code": ["def hello():", "import numpy as np", "print('Hello')"],
+        "comment": ["Function greeting", "Import numpy library", "Print hello message"]
+    })
+    # Test dataframe properties
+    assert len(df) == 3
+    assert list(df.columns) == ["code", "comment"]
+    print("✅ Sample dataframe test passed")
+def test_database_initialization():
+    """Test if database can be initialized"""
+    try:
+        from database import init_db
+        init_db()
+        assert Path("database/data/mlmanager.db").exists(), "Database file was not created"
+        print("✅ Database initialization test passed")
+    except ImportError:
+        print("⚠️ Could not import database module")
+        assert False, "Database module not found"
+def run_tests():
+    """Run all tests"""
+    print("Running tests for ML Dataset & Code Generation Manager...")
+    test_directory_structure()
+    test_css_file()
+    test_huggingface_config()
+    test_streamlit_config()
+    test_sample_dataframe()
+    test_database_initialization()
+    print("\nAll tests passed! ✅")
+def test_components_existence():
+    """Test if core components directories exist"""
+    # Check for components directory
+    components_dir = Path("components")
+    assert components_dir.exists() and components_dir.is_dir(), "Components directory not found"
+    # Check for fine_tuning subdirectory
+    fine_tuning_dir = components_dir / "fine_tuning"
+    assert fine_tuning_dir.exists() and fine_tuning_dir.is_dir(), "Fine-tuning components directory not found"
+    # Check for essential component files
+    assert (components_dir / "code_quality.py").exists(), "Code quality component not found"
+    assert (components_dir / "dataset_uploader.py").exists(), "Dataset uploader component not found"
+    print("✅ Components existence test passed")
+# Run the tests if executed directly (python test_app.py); importing this
+# module does not trigger run_tests().
+if __name__ == '__main__':
+    run_tests()

utils/dataset_utils.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import pandas as pd
+import numpy as np
+def get_dataset_info(df):
+    """
+    Get basic information about a dataset.
+    Args:
+        df: Pandas DataFrame
+    Returns:
+        Dictionary with dataset information
+    """
+    info = {
+        'rows': df.shape[0],
+        'columns': df.shape[1],
+        'missing_values': df.isna().sum().sum(),
+        'duplicate_rows': df.duplicated().sum(),
+        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
+        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
+        'column_info': []
+    }
+    # Get info for each column
+    for col in df.columns:
+        col_info = {
+            'name': col,
+            'type': str(df[col].dtype),
+            'missing': df[col].isna().sum(),
+            'missing_pct': (df[col].isna().sum() / len(df)) * 100,
+            'unique_values': df[col].nunique()
+        }
+        # Add additional info for numeric columns
+        if pd.api.types.is_numeric_dtype(df[col]):
+            col_info.update({
+                'min': df[col].min(),
+                'max': df[col].max(),
+                'mean': df[col].mean(),
+                'median': df[col].median(),
+                'std': df[col].std()
+            })
+        # Add additional info for categorical/text columns
+        elif pd.api.types.is_object_dtype(df[col]):
+            # Get top values
+            value_counts = df[col].value_counts().head(5).to_dict()
+            col_info['top_values'] = value_counts
+            # Estimate if it's a categorical column
+            if df[col].nunique() / len(df) < 0.1:  # If less than 10% of rows have unique values
+                col_info['likely_categorical'] = True
+            else:
+                col_info['likely_categorical'] = False
+        info['column_info'].append(col_info)
+    return info
+def detect_dataset_format(df):
+    """
+    Try to detect the format/type of the dataset based on its structure.
+    Args:
+        df: Pandas DataFrame
+    Returns:
+        String indicating the likely format
+    """
+    # Check for text data
+    text_cols = 0
+    for col in df.columns:
+        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100:
+            text_cols += 1
+    if text_cols / len(df.columns) > 0.5:
+        return "text"
+    # Check for time series data
+    date_cols = 0
+    for col in df.columns:
+        if pd.api.types.is_datetime64_dtype(df[col]):
+            date_cols += 1
+    if date_cols > 0:
+        return "time_series"
+    # Check if it looks like tabular data
+    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
+    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)
+    if numeric_cols > 0 and categorical_cols > 0:
+        return "mixed"
+    elif numeric_cols > 0:
+        return "numeric"
+    elif categorical_cols > 0:
+        return "categorical"
+    # Default
+    return "generic"
+def check_column_completeness(df, threshold=0.8):
+    """
+    Check if columns have good completeness (less than 20% missing values by default).
+    Args:
+        df: Pandas DataFrame
+        threshold: Completeness threshold (0.8 = 80% complete)
+    Returns:
+        List of columns with poor completeness
+    """
+    results = []
+    for col in df.columns:
+        missing_ratio = df[col].isna().sum() / len(df)
+        completeness = 1 - missing_ratio
+        if completeness < threshold:
+            results.append({
+                'Column': col,
+                'Completeness': f"{completeness:.2%}",
+                'Missing': f"{missing_ratio:.2%}",
+                'Recommendation': 'Consider imputing or removing this column'
+            })
+    return results
+def detect_outliers(series, method='iqr', factor=1.5):
+    """
+    Detect outliers in a pandas Series using IQR or Z-score method.
+    Args:
+        series: Pandas Series with numeric values
+        method: 'iqr' or 'zscore'
+        factor: Multiplier for IQR or Z-score threshold
+    Returns:
+        Tuple of (outlier_indices, lower_bound, upper_bound)
+    """
+    if method == 'iqr':
+        # IQR method
+        q1 = series.quantile(0.25)
+        q3 = series.quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - factor * iqr
+        upper_bound = q3 + factor * iqr
+        outliers = series[(series < lower_bound) | (series > upper_bound)].index.tolist()
+    else:  # zscore
+        # Z-score method
+        from scipy import stats
+        z_scores = stats.zscore(series.dropna())
+        abs_z_scores = abs(z_scores)
+        # Filter for Z-scores above threshold
+        outlier_indices = np.where(abs_z_scores > factor)[0]
+        outliers = series.dropna().iloc[outlier_indices].index.tolist()
+        # Compute equivalent bounds for consistency
+        mean = series.mean()
+        std = series.std()
+        lower_bound = mean - factor * std
+        upper_bound = mean + factor * std
+    return outliers, lower_bound, upper_bound

utils/huggingface_integration.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import streamlit as st
+import pandas as pd
+import os
+from huggingface_hub import HfApi, list_datasets
+from datasets import load_dataset
+@st.cache_data(ttl=3600)
+def search_huggingface_datasets(query, limit=20):
+    """
+    Search for datasets on Hugging Face Hub.
+    Args:
+        query: Search query string
+        limit: Maximum number of results to return
+    Returns:
+        List of dataset metadata
+    """
+    try:
+        api = HfApi()
+        datasets = list_datasets(
+            filter=query,
+            limit=limit
+        )
+        # Convert to list of dicts with relevant info
+        results = []
+        for dataset in datasets:
+            results.append({
+                'id': dataset.id,
+                'name': dataset.id.split('/')[-1],
+                'description': dataset.description or "No description available",
+                'author': dataset.author or "Unknown",
+                'tags': dataset.tags,
+                'downloads': dataset.downloads
+            })
+        return results
+    except Exception as e:
+        st.error(f"Error searching Hugging Face Hub: {str(e)}")
+        return []
+@st.cache_data(ttl=3600)
+def load_huggingface_dataset(dataset_id, split='train'):
+    """
+    Load a dataset from Hugging Face Hub.
+    Args:
+        dataset_id: ID of the dataset on HF Hub (e.g., 'mnist', 'glue', etc.)
+        split: Dataset split to load (e.g., 'train', 'test', 'validation')
+    Returns:
+        Pandas DataFrame containing the dataset
+    """
+    try:
+        # Load the dataset
+        dataset = load_dataset(dataset_id, split=split)
+        # Convert to pandas DataFrame
+        df = dataset.to_pandas()
+        return df
+    except Exception as e:
+        st.error(f"Error loading dataset '{dataset_id}': {str(e)}")
+        raise
+def upload_to_huggingface(dataset, dataset_name, token=None):
+    """
+    Upload a dataset to Hugging Face Hub.
+    Args:
+        dataset: Pandas DataFrame to upload
+        dataset_name: Name for the dataset
+        token: Hugging Face API token (optional, will use environment variable if not provided)
+    Returns:
+        URL to the uploaded dataset
+    """
+    # Get token from environment if not provided
+    if token is None:
+        token = os.getenv("HF_TOKEN")
+        if not token:
+            raise ValueError("No Hugging Face token provided. Set the HF_TOKEN environment variable or pass a token.")
+    try:
+        # Convert to HF dataset
+        from datasets import Dataset
+        hf_dataset = Dataset.from_pandas(dataset)
+        # Upload to HF Hub
+        push_result = hf_dataset.push_to_hub(
+            dataset_name,
+            token=token
+        )
+        return f"https://huggingface.co/datasets/{push_result.repo_id}"
+    except Exception as e:
+        st.error(f"Error uploading to Hugging Face Hub: {str(e)}")
+        raise

utils/smolagents_integration.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+def process_with_smolagents(dataset, operation, custom_code=None):
+    """
+    Process dataset using SmolaAgents for various operations.
+    Args:
+        dataset: Pandas DataFrame to process
+        operation: Type of processing operation
+        custom_code: Custom code to execute (for custom processing)
+    Returns:
+        Processed pandas DataFrame
+    """
+    if dataset is None:
+        raise ValueError("No dataset provided")
+    # Create a copy to avoid modifying the original
+    processed_df = dataset.copy()
+    try:
+        if operation == "Data Cleaning":
+            processed_df = clean_dataset(processed_df)
+        elif operation == "Feature Engineering":
+            processed_df = engineer_features(processed_df)
+        elif operation == "Data Transformation":
+            processed_df = transform_dataset(processed_df)
+        elif operation == "Custom Processing" and custom_code:
+            # Execute custom code
+            # Note: This is a security risk in a real application
+            # Should be replaced with a safer approach
+            local_vars = {"df": processed_df}
+            exec(custom_code, {"pd": pd, "np": np}, local_vars)
+            processed_df = local_vars["df"]
+        else:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return processed_df
+    except Exception as e:
+        st.error(f"Error during processing: {str(e)}")
+        raise
+def clean_dataset(df):
+    """
+    Clean the dataset by handling missing values, duplicates, and outliers.
+    Args:
+        df: Pandas DataFrame to clean
+    Returns:
+        Cleaned pandas DataFrame
+    """
+    # Create a copy to avoid modifying the original
+    cleaned_df = df.copy()
+    # Remove duplicate rows
+    cleaned_df = cleaned_df.drop_duplicates()
+    # Handle missing values
+    for col in cleaned_df.columns:
+        # For numeric columns
+        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
+            # If more than 20% missing, leave as is
+            if cleaned_df[col].isna().mean() > 0.2:
+                continue
+            # Otherwise impute with median
+            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
+        # For categorical columns
+        elif pd.api.types.is_object_dtype(cleaned_df[col]):
+            # If more than 20% missing, leave as is
+            if cleaned_df[col].isna().mean() > 0.2:
+                continue
+            # Otherwise impute with mode
+            mode_value = cleaned_df[col].mode()[0] if not cleaned_df[col].mode().empty else "Unknown"
+            cleaned_df[col] = cleaned_df[col].fillna(mode_value)
+    # Handle outliers in numeric columns
+    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
+        # Skip if too many missing values
+        if cleaned_df[col].isna().mean() > 0.1:
+            continue
+        # Calculate IQR
+        q1 = cleaned_df[col].quantile(0.25)
+        q3 = cleaned_df[col].quantile(0.75)
+        iqr = q3 - q1
+        # Define bounds
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        # Cap outliers instead of removing
+        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
+    return cleaned_df
+def engineer_features(df):
+    """
+    Perform basic feature engineering on the dataset.
+    Args:
+        df: Pandas DataFrame to process
+    Returns:
+        DataFrame with engineered features
+    """
+    # Create a copy to avoid modifying the original
+    engineered_df = df.copy()
+    # Get numeric columns
+    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns
+    # Skip if less than 2 numeric columns
+    if len(numeric_cols) >= 2:
+        # Create interaction features for pairs of numeric columns
+        # Limit to first 5 columns to avoid feature explosion
+        for i, col1 in enumerate(numeric_cols[:5]):
+            for col2 in numeric_cols[i+1:5]:
+                # Product interaction
+                engineered_df[f"{col1}_{col2}_product"] = engineered_df[col1] * engineered_df[col2]
+                # Ratio interaction (avoid division by zero)
+                denominator = engineered_df[col2].replace(0, np.nan)
+                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator
+    # Create binary features from categorical columns
+    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
+    for col in cat_cols:
+        # Skip if too many unique values (>10)
+        if engineered_df[col].nunique() > 10:
+            continue
+        # One-hot encode
+        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
+        engineered_df = pd.concat([engineered_df, dummies], axis=1)
+    # Create aggregated features
+    if len(numeric_cols) >= 3:
+        # Sum of all numeric features
+        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
+        # Mean of all numeric features
+        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
+        # Standard deviation of numeric features
+        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)
+    return engineered_df
+def transform_dataset(df):
+    """
+    Perform data transformations on the dataset.
+    Args:
+        df: Pandas DataFrame to transform
+    Returns:
+        Transformed pandas DataFrame
+    """
+    from sklearn.preprocessing import StandardScaler, MinMaxScaler
+    # Create a copy to avoid modifying the original
+    transformed_df = df.copy()
+    # Get numeric columns
+    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) > 0:
+        # Create scaled versions of numeric columns
+        # Standard scaling (z-score)
+        scaler = StandardScaler()
+        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
+        scaled_df = pd.DataFrame(
+            scaled_data,
+            columns=[f"{col}_scaled" for col in numeric_cols],
+            index=transformed_df.index
+        )
+        # Min-max scaling (0-1 range)
+        minmax_scaler = MinMaxScaler()
+        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
+        minmax_df = pd.DataFrame(
+            minmax_data,
+            columns=[f"{col}_normalized" for col in numeric_cols],
+            index=transformed_df.index
+        )
+        # Log transform (for positive columns only)
+        log_cols = []
+        for col in numeric_cols:
+            if (transformed_df[col] > 0).all():
+                transformed_df[f"{col}_log"] = np.log(transformed_df[col])
+                log_cols.append(f"{col}_log")
+        # Combine all transformations
+        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)
+    # One-hot encode categorical columns
+    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
+    if len(cat_cols) > 0:
+        # One-hot encode all categorical columns
+        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)
+    return transformed_df

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff