jzou19950715 committed
Commit ad9e004 · verified · 1 Parent(s): cedb0a7

Update app.py

Files changed (1):
  1. app.py +157 -139
app.py CHANGED
@@ -1,162 +1,158 @@
- import os
  import gradio as gr
  import pandas as pd
  import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
  from typing import Dict, List, Optional
  import openai
- from dataclasses import dataclass
- import plotly.express as px
- from sklearn.preprocessing import StandardScaler
- from sklearn.model_selection import train_test_split
- import statsmodels.api as sm
-
- # System prompt for data analysis
- DATA_ANALYSIS_PROMPT = """
- <DataScienceExpertFramework version="1.0">
-     <Identity>
-         <Description>You are an expert data scientist who combines technical precision with clear insights.</Description>
-     </Identity>
-     <CoreCapabilities>
-         <Analysis>
-             <Capability>Statistical analysis and hypothesis testing</Capability>
-             <Capability>Pattern recognition and insights</Capability>
-             <Capability>Data visualization recommendations</Capability>
-         </Analysis>
-     </CoreCapabilities>
-     <AnalysisApproach>
-         <Step>Assess data quality and structure</Step>
-         <Step>Identify key patterns and relationships</Step>
-         <Step>Perform statistical analysis</Step>
-         <Step>Generate visualizations</Step>
-         <Step>Provide actionable insights</Step>
-     </AnalysisApproach>
- </DataScienceExpertFramework>
- """

- def format_stats_results(results: Dict) -> str:
-     """Format statistical results for display"""
-     formatted = []
-     for test_name, result in results.items():
-         if "normality" in test_name:
-             formatted.append(f"- {test_name}: {'Normal' if result['is_normal'] else 'Non-normal'} "
-                              f"(p={result['p_value']:.4f})")
-         elif "correlation" in test_name:
-             formatted.append(f"- {test_name}: {result['correlation']:.4f} "
-                              f"(p={result['p_value']:.4f})")
-     return "\n".join(formatted)

- def analyze_data(df: pd.DataFrame) -> Dict:
-     """Analyze dataframe and return statistics"""
-     analysis = {
-         "shape": df.shape,
-         "dtypes": df.dtypes.to_dict(),
-         "missing": df.isnull().sum().to_dict(),
-         "numeric_summary": df.describe().to_dict(),
-         "correlations": {}
      }
-
-     # Calculate correlations for numeric columns
-     numeric_cols = df.select_dtypes(include=[np.number]).columns
-     if len(numeric_cols) >= 2:
-         corr_matrix = df[numeric_cols].corr()
-         analysis["correlations"] = corr_matrix.to_dict()
-
-     return analysis

- def create_visualizations(df: pd.DataFrame, save_dir: str = "figures") -> List[str]:
-     """Create and save visualizations"""
-     os.makedirs(save_dir, exist_ok=True)
-     paths = []
-
-     # Correlation heatmap
-     numeric_cols = df.select_dtypes(include=[np.number]).columns
-     if len(numeric_cols) >= 2:
-         plt.figure(figsize=(10, 8))
-         sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm')
-         plt.title("Correlation Heatmap")
-         path = os.path.join(save_dir, "correlation_heatmap.png")
-         plt.savefig(path)
-         plt.close()
-         paths.append(path)
-
-     # Distribution plots for numeric columns
-     for col in numeric_cols[:5]:  # Limit to first 5 columns
          plt.figure(figsize=(10, 6))
-         sns.histplot(df[col], kde=True)
-         plt.title(f"Distribution of {col}")
-         path = os.path.join(save_dir, f"dist_{col}.png")
-         plt.savefig(path)
          plt.close()
-         paths.append(path)
-
-     return paths

- def chat_with_data_scientist(message: str, history: List, api_key: str, df: Optional[pd.DataFrame] = None) -> List:
-     """Chat with GPT-4o-mini about data analysis"""
-     if not api_key:
-         return history + [
-             ("Please provide an API key to continue.", None)
-         ]
-
-     if df is None:
-         return history + [
-             ("Please upload a CSV file to analyze.", None)
-         ]
-
-     try:
-         client = openai.OpenAI(api_key=api_key)
-
-         # Create analysis summary
-         analysis = analyze_data(df)
-         analysis_text = f"""
-         Dataset Shape: {analysis['shape']}
-         Missing Values: {sum(analysis['missing'].values())}
-         Numeric Columns: {list(analysis['numeric_summary'].keys())}
-         """
-
-         messages = [
-             {"role": "system", "content": DATA_ANALYSIS_PROMPT},
-             {"role": "system", "content": f"Analysis Context:\n{analysis_text}"},
-             {"role": "user", "content": message}
-         ]
-
-         response = client.chat.completions.create(
-             model="gpt-4o-mini",
-             messages=messages,
-             max_tokens=500
-         )
-
-         return history + [
-             (message, response.choices[0].message.content)
-         ]

-     except Exception as e:
-         return history + [
-             (message, f"Error: {str(e)}")
-         ]

  def create_demo():
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("# 🔬 Data Science Expert")

          with gr.Row():
              with gr.Column():
                  api_key = gr.Textbox(
-                     label="GPT-4o-mini API Key",
-                     placeholder="sk-...",
-                     type="password"
                  )
                  file_input = gr.File(
-                     label="Upload CSV file",
                      file_types=[".csv"]
                  )
-                 system_prompt = gr.Textbox(
-                     label="System Prompt",
-                     value=DATA_ANALYSIS_PROMPT,
-                     lines=5
-                 )

              with gr.Column():
                  chat = gr.Chatbot(label="Analysis Chat")
@@ -166,7 +162,7 @@ def create_demo():
                  )
                  clear = gr.Button("Clear")

-         # Store DataFrame in state
          df_state = gr.State(None)

          def process_file(file):
@@ -174,6 +170,29 @@ def create_demo():
                  return None
              return pd.read_csv(file.name)

          file_input.change(
              process_file,
              inputs=[file_input],
@@ -181,7 +200,7 @@ def create_demo():
          )

          msg.submit(
-             chat_with_data_scientist,
              inputs=[msg, chat, api_key, df_state],
              outputs=[chat]
          )
@@ -190,9 +209,8 @@ def create_demo():
      return demo

- demo = create_demo()
-
  if __name__ == "__main__":
      demo.launch()
  else:
      demo.launch(show_api=False)

+ from transformers import Tool, ReactCodeAgent, HfApiEngine
  import gradio as gr
  import pandas as pd
  import numpy as np
+ import plotly.express as px
+ import plotly.graph_objects as go
  from typing import Dict, List, Optional
  import openai
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import io
+ import base64

+ # Custom Tools for Data Analysis
+ class DataVisualizationTool(Tool):
+     name = "data_visualizer"
+     description = """Creates various types of visualizations from data:
+     - Correlation heatmaps
+     - Distribution plots
+     - Scatter plots
+     - Time series plots
+     Returns the plots as base64 encoded images."""

+     inputs = {
+         "data": {
+             "type": "dict",
+             "description": "DataFrame as dictionary"
+         },
+         "plot_type": {
+             "type": "string",
+             "description": "Type of plot to create: 'heatmap', 'distribution', 'scatter'"
+         },
+         "columns": {
+             "type": "list",
+             "description": "List of columns to plot"
+         }
      }
+     output_type = "string"  # base64 encoded image

+     def forward(self, data: Dict, plot_type: str, columns: List[str]) -> str:
+         df = pd.DataFrame(data)
          plt.figure(figsize=(10, 6))
+
+         if plot_type == "heatmap":
+             sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm')
+             plt.title("Correlation Heatmap")
+         elif plot_type == "distribution":
+             for col in columns:
+                 sns.histplot(df[col], kde=True, label=col)
+             plt.title("Distribution Plot")
+             plt.legend()
+         elif plot_type == "scatter":
+             if len(columns) >= 2:
+                 sns.scatterplot(data=df, x=columns[0], y=columns[1])
+                 plt.title(f"Scatter Plot: {columns[0]} vs {columns[1]}")
+
+         # Convert plot to base64
+         buf = io.BytesIO()
+         plt.savefig(buf, format='png')
          plt.close()
+         buf.seek(0)
+         return base64.b64encode(buf.read()).decode('utf-8')

+ class DataAnalysisTool(Tool):
+     name = "data_analyzer"
+     description = """Performs statistical analysis on data:
+     - Basic statistics (mean, median, std)
+     - Correlation analysis
+     - Missing value analysis
+     - Outlier detection"""
+
+     inputs = {
+         "data": {
+             "type": "dict",
+             "description": "DataFrame as dictionary"
+         },
+         "analysis_type": {
+             "type": "string",
+             "description": "Type of analysis: 'basic', 'correlation', 'missing', 'outliers'"
+         },
+         "columns": {
+             "type": "list",
+             "description": "List of columns to analyze"
+         }
+     }
+     output_type = "dict"
+
+     def forward(self, data: Dict, analysis_type: str, columns: List[str]) -> Dict:
+         df = pd.DataFrame(data)
+         selected_cols = [col for col in columns if col in df.columns]

+         if analysis_type == "basic":
+             return {
+                 "statistics": df[selected_cols].describe().to_dict(),
+                 "skew": df[selected_cols].skew().to_dict(),
+                 "kurtosis": df[selected_cols].kurtosis().to_dict()
+             }
+         elif analysis_type == "correlation":
+             numeric_cols = df[selected_cols].select_dtypes(include=[np.number])
+             return {
+                 "correlation": numeric_cols.corr().to_dict(),
+                 "covariance": numeric_cols.cov().to_dict()
+             }
+         elif analysis_type == "missing":
+             return {
+                 "missing_counts": df[selected_cols].isnull().sum().to_dict(),
+                 "missing_percentages": (df[selected_cols].isnull().mean() * 100).to_dict()
+             }
+         elif analysis_type == "outliers":
+             outliers = {}
+             for col in selected_cols:
+                 if df[col].dtype in [np.float64, np.int64]:
+                     Q1 = df[col].quantile(0.25)
+                     Q3 = df[col].quantile(0.75)
+                     IQR = Q3 - Q1
+                     outliers[col] = {
+                         "outliers_count": len(df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]),
+                         "lower_bound": Q1 - 1.5 * IQR,
+                         "upper_bound": Q3 + 1.5 * IQR
+                     }
+             return {"outliers": outliers}

  def create_demo():
+     # Initialize tools
+     viz_tool = DataVisualizationTool()
+     analysis_tool = DataAnalysisTool()
+
+     # Create agent with tools
+     llm_engine = HfApiEngine()  # Uses default model
+     agent = ReactCodeAgent(
+         tools=[viz_tool, analysis_tool],
+         llm_engine=llm_engine
+     )
+
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🔬 Advanced Data Analysis Agent")

          with gr.Row():
              with gr.Column():
                  api_key = gr.Textbox(
+                     label="OpenAI API Key",
+                     type="password",
+                     placeholder="sk-..."
                  )
                  file_input = gr.File(
+                     label="Upload CSV",
                      file_types=[".csv"]
                  )
+                 with gr.Accordion("Advanced Settings", open=False):
+                     system_prompt = gr.Textbox(
+                         label="System Prompt",
+                         value="""You are a data science expert. Analyze the data and create
+                         visualizations to help understand patterns and insights.""",
+                         lines=3
+                     )

              with gr.Column():
                  chat = gr.Chatbot(label="Analysis Chat")

                  )
                  clear = gr.Button("Clear")

+         # State for storing the DataFrame
          df_state = gr.State(None)

          def process_file(file):
                  return None
              return pd.read_csv(file.name)

+         def process_message(message, chat_history, api_key, df):
+             if df is None:
+                 return chat_history + [(message, "Please upload a CSV file first.")]
+
+             try:
+                 # Convert DataFrame to dict for tools
+                 data_dict = df.to_dict()
+
+                 # Get all columns for potential analysis
+                 columns = list(df.columns)
+
+                 # Use agent to analyze and create visualizations
+                 response = agent.run(
+                     f"""Analyze this data: {message}
+                     Available columns: {columns}
+                     Use the data_analyzer and data_visualizer tools to create insights."""
+                 )
+
+                 return chat_history + [(message, response)]
+
+             except Exception as e:
+                 return chat_history + [(message, f"Error: {str(e)}")]
+
          file_input.change(
              process_file,
              inputs=[file_input],

          )

          msg.submit(
+             process_message,
              inputs=[msg, chat, api_key, df_state],
              outputs=[chat]
          )

      return demo

  if __name__ == "__main__":
+     demo = create_demo()
      demo.launch()
  else:
      demo.launch(show_api=False)
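For reference, the outlier branch of the new data_analyzer tool applies the standard 1.5 * IQR rule per column. Below is a minimal standalone sketch of that rule; it is illustrative only and not part of the commit, and the toy DataFrame and the column name "value" are invented for the example.

import pandas as pd

# Toy data; "value" is a made-up column name for illustration.
df = pd.DataFrame({"value": [1.0, 2.0, 2.5, 3.0, 2.8, 100.0]})

col = "value"
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Same flagging rule as DataAnalysisTool.forward with analysis_type="outliers"
outliers_count = len(df[(df[col] < lower_bound) | (df[col] > upper_bound)])
print({"outliers_count": outliers_count, "lower_bound": lower_bound, "upper_bound": upper_bound})

Note that the data_visualizer tool returns its plot as a base64-encoded PNG string, so a caller would decode it with base64.b64decode before saving it to a file or embedding it in the UI.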