Spaces:

jzou19950715
/

Lossdog_Data_Science_Expert

Running

App Files Files Community

jzou19950715 committed on Jan 20

Commit

cedb0a7

verified ·

1 Parent(s): dbaa1d5

Update app.py

Browse files

Files changed (1) hide show

app.py +168 -316

app.py CHANGED Viewed

@@ -1,346 +1,198 @@
 import os
-import requests
 import gradio as gr
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from typing import Dict, List, Tuple, Optional
 from dataclasses import dataclass
-from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-from sklearn.impute import SimpleImputer
 import statsmodels.api as sm
-import plotly.express as px
-import plotly.graph_objects as go
-from scipy import stats
-@dataclass
-class AnalysisConfig:
-    """Configuration for analysis parameters"""
-    max_iterations: int = 5
-    min_samples_for_analysis: int = 30
-    correlation_threshold: float = 0.7
-    max_categories_for_viz: int = 10
-    significance_level: float = 0.05
-class DataAnalyzer:
-    """Intelligent data analysis agent that determines appropriate visualizations and analyses"""
-    def __init__(self, api_key: str):
-        self.api_key = api_key
-        self.config = AnalysisConfig()
-        self.current_iteration = 0
-        self.analysis_results = []
-    def call_gpt4o_mini(self, prompt: str, system_prompt: str) -> str:
-        """Call GPT-4o-mini API with proper error handling"""
-        try:
-            client = openai.OpenAI(api_key=self.api_key)
-            messages = [
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt}
-            ]
-            response = client.chat.completions.create(
-                model="gpt-4o-mini",
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            return f"API Error: {str(e)}"
-    def evaluate_code(self, code: str, state: Dict = None) -> Tuple[Any, str]:
-        """Safely evaluate Python code with proper state management and security"""
-        if state is None:
-            state = {"print_outputs": ""}
-        # Create safe environment with allowed imports
-        safe_env = {
-            "pd": pd,
-            "np": np,
-            "plt": plt,
-            "sns": sns,
-            "stats": stats,
-            "print": lambda *args: state.update({"print_outputs": state["print_outputs"] + " ".join(map(str, args)) + "\n"}),
-        }
-        try:
-            exec(code, safe_env, state)
-            return state.get("result", None), state["print_outputs"]
-        except Exception as e:
-            raise RuntimeError(f"Code execution failed: {str(e)}")
-    def analyze_data_types(self, df: pd.DataFrame) -> Dict:
-        """Analyze data types and basic statistics of the DataFrame"""
-        analysis = {
-            "numeric_cols": df.select_dtypes(include=['int64', 'float64']).columns.tolist(),
-            "categorical_cols": df.select_dtypes(include=['object', 'category']).columns.tolist(),
-            "temporal_cols": df.select_dtypes(include=['datetime64']).columns.tolist(),
-            "missing_values": df.isnull().sum().to_dict(),
-            "unique_counts": df.nunique().to_dict()
-        }
-        return analysis
-    def create_visualization(self, df: pd.DataFrame, viz_type: str, columns: List[str]) -> str:
-        """Create and save visualization based on data types and relationships"""
         plt.figure(figsize=(10, 6))
-        if viz_type == "correlation":
-            sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm')
-            plt.title("Correlation Matrix")
-        elif viz_type == "distribution":
-            for col in columns:
-                sns.histplot(data=df, x=col, kde=True)
-                plt.title(f"Distribution of {col}")
-        elif viz_type == "boxplot":
-            sns.boxplot(data=df[columns])
-            plt.title("Box Plot of Numeric Variables")
-        output_path = f"viz_{self.current_iteration}.png"
-        plt.savefig(output_path)
         plt.close()
-        return output_path
-    def perform_statistical_tests(self, df: pd.DataFrame, data_types: Dict) -> Dict:
-        """Perform relevant statistical tests based on data types"""
-        results = {}
-        # Normality tests for numeric columns
-        for col in data_types["numeric_cols"]:
-            if len(df[col].dropna()) > 3:
-                stat, p_value = stats.normaltest(df[col].dropna())
-                results[f"normality_{col}"] = {
-                    "statistic": stat,
-                    "p_value": p_value,
-                    "is_normal": p_value > self.config.significance_level
-                }
-        # Chi-square tests for categorical columns
-        for col1 in data_types["categorical_cols"]:
-            for col2 in data_types["categorical_cols"]:
-                if col1 < col2:
-                    contingency = pd.crosstab(df[col1], df[col2])
-                    chi2, p_value, _, _ = stats.chi2_contingency(contingency)
-                    results[f"chi2_{col1}_{col2}"] = {
-                        "statistic": chi2,
-                        "p_value": p_value,
-                        "is_significant": p_value < self.config.significance_level
-                    }
-        return results
-    def train_predictive_model(self, df: pd.DataFrame, target_col: str) -> Tuple[float, str]:
-        """Train and evaluate a predictive model based on data characteristics"""
-        X = df.drop(columns=[target_col])
-        y = df[target_col]
-        # Preprocessing
-        numeric_transformer = Pipeline([
-            ('imputer', SimpleImputer(strategy='median')),
-            ('scaler', StandardScaler())
-        ])
-        categorical_transformer = Pipeline([
-            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-            ('onehot', OneHotEncoder(handle_unknown='ignore'))
-        ])
-        preprocessor = ColumnTransformer(
-            transformers=[
-                ('num', numeric_transformer, X.select_dtypes(include=['int64', 'float64']).columns),
-                ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)
-            ])
-        if len(np.unique(y)) <= 5:  # Classification
-            model = RandomForestClassifier(n_estimators=100, random_state=42)
-            metric = 'accuracy'
-        else:  # Regression
-            model = RandomForestRegressor(n_estimators=100, random_state=42)
-            metric = 'r2'
-        pipeline = Pipeline([
-            ('preprocessor', preprocessor),
-            ('model', model)
-        ])
-        # Train and evaluate
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-        pipeline.fit(X_train, y_train)
-        y_pred = pipeline.predict(X_test)
-        if metric == 'accuracy':
-            score = accuracy_score(y_test, y_pred)
-        else:
-            score = r2_score(y_test, y_pred)
-        return score, metric
-class GradioInterface:
-    """Gradio interface for the data analysis agent"""
-    def __init__(self):
-        self.analyzer = None
-        self.df = None
-    DEFAULT_SYSTEM_PROMPT = """
-<DataScienceExpertFramework version="1.0">
-    <Identity>
-        <Description>
-            You are an expert data scientist and analyst who combines technical precision with clear communication. You specialize in uncovering insights through advanced statistical analysis, machine learning, and data visualization.
-        </Description>
-    </Identity>
-    <CoreCapabilities>
-        <Analysis>
-            <Capability>Advanced statistical analysis and hypothesis testing</Capability>
-            <Capability>Machine learning model development and evaluation</Capability>
-            <Capability>Data visualization and exploratory data analysis</Capability>
-            <Capability>Pattern recognition and trend identification</Capability>
-            <Capability>Feature engineering and selection</Capability>
-        </Analysis>
-        <Communication>
-            <Style>Clear and precise technical explanations</Style>
-            <Style>Business-oriented insights translation</Style>
-            <Style>Visual representation of complex patterns</Style>
-        </Communication>
-    </CoreCapabilities>
-    <AnalysisApproach>
-        <Step>Data Quality Assessment</Step>
-        <Step>Exploratory Data Analysis</Step>
-        <Step>Statistical Testing</Step>
-        <Step>Pattern Recognition</Step>
-        <Step>Insight Generation</Step>
-        <Step>Visualization Creation</Step>
-        <Step>Recommendations Development</Step>
-    </AnalysisApproach>
-    <OutputGuidelines>
-        <Format>
-            <Section>Key Findings Summary</Section>
-            <Section>Detailed Statistical Analysis</Section>
-            <Section>Visualization Descriptions</Section>
-            <Section>Actionable Recommendations</Section>
-        </Format>
-        <Standards>
-            <Standard>Always explain statistical significance</Standard>
-            <Standard>Provide context for numerical findings</Standard>
-            <Standard>Highlight practical implications</Standard>
-            <Standard>Address data limitations</Standard>
-        </Standards>
-    </OutputGuidelines>
-</DataScienceExpertFramework>
-"""
-def create_interface(self):
-        with gr.Blocks(theme=gr.themes.Soft()) as demo:
-            gr.Markdown("# 🔍 Intelligent Data Analysis Agent")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    api_key = gr.Textbox(
-                        label="GPT-4o-mini API Key",
-                        type="password",
-                        placeholder="sk-..."
-                    )
-                    file_input = gr.File(
-                        label="Upload CSV file"
-                    )
-                    with gr.Accordion("⚙️ Advanced Settings", open=False):
-                        system_prompt = gr.TextArea(
-                            label="System Prompt",
-                            value=DEFAULT_SYSTEM_PROMPT,
-                            lines=8
-                        )
-            with gr.Row():
-                analysis_notes = gr.Textbox(
-                    label="Analysis Notes (Optional)",
-                    placeholder="Any specific analysis preferences...")
-            with gr.Row():
-                analyze_btn = gr.Button("Analyze Data")
-                clear_btn = gr.Button("Clear")
-            output_text = gr.Markdown()
-            output_gallery = gr.Gallery()
-            def analyze(api_key, file, notes, system_prompt):
-                if not api_key or not file:
-                    return "Please provide both API key and data file.", None
-                try:
-                    self.df = pd.read_csv(file.name)
-                    self.analyzer = DataAnalyzer(api_key)
-                    # Get AI suggestions for analysis
-                    prompt = f"Data columns: {list(self.df.columns)}\nUser notes: {notes}\nSuggest appropriate analyses and visualizations."
-                    ai_suggestions = self.analyzer.call_gpt4o_mini(prompt)
-                    # Perform analysis
-                    data_types = self.analyzer.analyze_data_types(self.df)
-                    stats_results = self.analyzer.perform_statistical_tests(self.df, data_types)
-                    # Create visualizations
-                    viz_paths = []
-                    for viz_type in ["correlation", "distribution", "boxplot"]:
-                        if data_types["numeric_cols"]:
-                            path = self.analyzer.create_visualization(
-                                self.df, viz_type, data_types["numeric_cols"]
-                            )
-                            viz_paths.append(path)
-                    # Generate summary
-                    summary = f"""
-                    ## Data Analysis Results
-                    ### AI Suggestions
-                    {ai_suggestions}
-                    ### Basic Statistics
-                    - Rows: {len(self.df)}
-                    - Columns: {len(self.df.columns)}
-                    - Missing Values: {sum(data_types['missing_values'].values())}
-                    ### Statistical Tests
-                    {self._format_stats_results(stats_results)}
-                    """
-                    return summary, viz_paths
-                except Exception as e:
-                    return f"Error during analysis: {str(e)}", None
-            analyze_btn.click(
-                analyze,
-                inputs=[api_key, file_input, analysis_notes, system_prompt],
-                outputs=[output_text, output_gallery]
-            )
-            clear_btn.click(
-                lambda: (None, None),
-                outputs=[output_text, output_gallery]
-            )
         return demo
-        @staticmethod
-    def _format_stats_results(results: Dict) -> str:
-        """Format statistical results for display"""
-        formatted = []
-        for test_name, result in results.items():
-            if "normality" in test_name:
-                formatted.append(f"- {test_name}: {'Normal' if result['is_normal'] else 'Non-normal'} "
-                               f"(p={result['p_value']:.4f})")
-            elif "chi2" in test_name:
-                formatted.append(f"- {test_name}: {'Significant' if result['is_significant'] else 'Not significant'} "
-                               f"(p={result['p_value']:.4f})")
-        return "\n".join(formatted)
 if __name__ == "__main__":
-    interface = GradioInterface()
-    demo = interface.create_interface()
-    demo.launch(share=True)

 import os
 import gradio as gr
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+from typing import Dict, List, Optional
+import openai
 from dataclasses import dataclass
+import plotly.express as px
+from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
 import statsmodels.api as sm
+# System prompt for data analysis
+DATA_ANALYSIS_PROMPT = """
+<DataScienceExpertFramework version="1.0">
+    <Identity>
+        <Description>You are an expert data scientist who combines technical precision with clear insights.</Description>
+    </Identity>
+    <CoreCapabilities>
+        <Analysis>
+            <Capability>Statistical analysis and hypothesis testing</Capability>
+            <Capability>Pattern recognition and insights</Capability>
+            <Capability>Data visualization recommendations</Capability>
+        </Analysis>
+    </CoreCapabilities>
+    <AnalysisApproach>
+        <Step>Assess data quality and structure</Step>
+        <Step>Identify key patterns and relationships</Step>
+        <Step>Perform statistical analysis</Step>
+        <Step>Generate visualizations</Step>
+        <Step>Provide actionable insights</Step>
+    </AnalysisApproach>
+</DataScienceExpertFramework>
+"""
def format_stats_results(results: Dict) -> str:
    """Render statistical test results as a bullet list, one line per test.

    Entries whose name contains "normality" are reported as Normal or
    Non-normal. Otherwise, entries whose name contains "correlation" are
    reported with their coefficient. Both kinds include the p-value with
    four decimals. Entries matching neither pattern are skipped.

    Args:
        results: Mapping of test name to that test's values
            ("is_normal" and "p_value", or "correlation" and "p_value").

    Returns:
        The rendered "- ..." lines joined with newlines ("" when nothing
        matched).
    """
    bullets = []
    for name, outcome in results.items():
        if "normality" in name:
            verdict = "Normal" if outcome["is_normal"] else "Non-normal"
            bullets.append(f"- {name}: {verdict} (p={outcome['p_value']:.4f})")
            continue
        if "correlation" in name:
            coefficient = outcome["correlation"]
            bullets.append(f"- {name}: {coefficient:.4f} (p={outcome['p_value']:.4f})")
    return "\n".join(bullets)
def analyze_data(df: pd.DataFrame) -> Dict:
    """Summarise a DataFrame's structure and its numeric content.

    Args:
        df: The dataset to inspect.

    Returns:
        A dict with the keys:
            "shape": ``df.shape``.
            "dtypes": column name -> dtype.
            "missing": column name -> number of null values.
            "numeric_summary": ``describe()`` statistics for the numeric
                columns only; ``{}`` when there are none.
            "correlations": pairwise correlation matrix of the numeric
                columns as a nested dict; ``{}`` unless there are at
                least two numeric columns.
    """
    numeric = df.select_dtypes(include=[np.number])
    numeric_count = len(numeric.columns)

    analysis = {
        "shape": df.shape,
        "dtypes": df.dtypes.to_dict(),
        "missing": df.isnull().sum().to_dict(),
        # Calling describe() on the whole frame would summarise the
        # non-numeric columns when no numeric column exists, and raises
        # ValueError on a frame with no columns at all. Restricting it to
        # the numeric columns keeps this key true to its name.
        "numeric_summary": numeric.describe().to_dict() if numeric_count else {},
        "correlations": {},
    }

    # A correlation matrix needs at least two numeric columns.
    if numeric_count >= 2:
        analysis["correlations"] = numeric.corr().to_dict()
    return analysis
def create_visualizations(
    df: pd.DataFrame,
    save_dir: str = "figures",
    max_distributions: int = 5,
) -> List[str]:
    """Create and save a correlation heatmap and per-column histograms.

    Args:
        df: The dataset to plot.
        save_dir: Directory the PNG files are written to; created if it
            does not exist.
        max_distributions: Upper bound on how many numeric columns (taken
            in column order) get a distribution plot.

    Returns:
        Paths of the files written, heatmap first (only when there are at
        least two numeric columns), then one "dist_<column>.png" per
        plotted column.
    """
    os.makedirs(save_dir, exist_ok=True)
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Column names end up in file names, so replace characters that would
    # act as path separators or are invalid in file names on common
    # platforms. Ordinary names are left unchanged.
    # NOTE: two columns differing only in these characters map to the same
    # file, and the later plot overwrites the earlier one.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def _render(draw, figsize, title, filename) -> str:
        """Draw one figure, save it under save_dir, and always close it."""
        fig = plt.figure(figsize=figsize)
        try:
            draw()
            plt.title(title)
            target = os.path.join(save_dir, filename)
            plt.savefig(target)
        finally:
            # Close even when plotting or saving fails so figures do not
            # accumulate across calls.
            plt.close(fig)
        return target

    paths = []

    # Correlation heatmap
    if len(numeric_cols) >= 2:
        paths.append(
            _render(
                lambda: sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm'),
                (10, 8),
                "Correlation Heatmap",
                "correlation_heatmap.png",
            )
        )

    # Distribution plots for numeric columns
    for col in numeric_cols[:max_distributions]:
        safe_name = str(col).translate(unsafe_chars)
        paths.append(
            _render(
                lambda col=col: sns.histplot(df[col], kde=True),
                (10, 6),
                f"Distribution of {col}",
                f"dist_{safe_name}.png",
            )
        )
    return paths
def chat_with_data_scientist(message: str, history: List, api_key: str, df: Optional[pd.DataFrame] = None) -> List:
    """Answer a question about the uploaded dataset via GPT-4o-mini.

    Args:
        message: The user's question.
        history: Previous (user, assistant) pairs; ``None`` is treated as
            an empty conversation. The list passed in is not modified.
        api_key: OpenAI API key.
        df: The uploaded dataset, or ``None`` if nothing is loaded yet.

    Returns:
        A new history list: the previous pairs plus one
        ``(message, reply)`` pair. The reply is the model's answer, a
        notice when the API key or dataset is missing, or an
        ``"Error: ..."`` text if the request failed.
    """
    turns = list(history) if history else []

    # Validation notices go in the assistant slot, paired with what the user
    # typed, so the message is not lost and the notice is not shown as if
    # the user had said it.
    if not api_key:
        return turns + [(message, "Please provide an API key to continue.")]
    if df is None:
        return turns + [(message, "Please upload a CSV file to analyze.")]

    try:
        client = openai.OpenAI(api_key=api_key)
        # Create analysis summary
        analysis = analyze_data(df)
        analysis_text = f"""
        Dataset Shape: {analysis['shape']}
        Missing Values: {sum(analysis['missing'].values())}
        Numeric Columns: {list(analysis['numeric_summary'].keys())}
        """
        messages = [
            {"role": "system", "content": DATA_ANALYSIS_PROMPT},
            {"role": "system", "content": f"Analysis Context:\n{analysis_text}"},
            {"role": "user", "content": message},
        ]
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=500,
        )
        reply = response.choices[0].message.content
    except Exception as exc:
        # UI boundary: report the failure in the chat rather than crashing
        # the handler.
        reply = f"Error: {str(exc)}"
    return turns + [(message, reply)]
def create_demo():
    """Build the Gradio Blocks UI for chatting about an uploaded CSV.

    The left column holds the API key, the CSV upload and a system-prompt
    textbox. The right column holds the chat window, the question box and
    a Clear button. The parsed DataFrame is kept in a ``gr.State`` so each
    question can reuse it.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🔬 Data Science Expert")
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(
                    label="GPT-4o-mini API Key",
                    placeholder="sk-...",
                    type="password"
                )
                file_input = gr.File(
                    label="Upload CSV file",
                    file_types=[".csv"]
                )
                # NOTE(review): this textbox is displayed but its value is
                # not among the inputs passed to the chat handler below, so
                # editing it has no effect on the request.
                system_prompt = gr.Textbox(
                    label="System Prompt",
                    value=DATA_ANALYSIS_PROMPT,
                    lines=5
                )
            with gr.Column():
                chat = gr.Chatbot(label="Analysis Chat")
                msg = gr.Textbox(
                    label="Ask about your data",
                    placeholder="What insights can you find in this dataset?"
                )
                clear = gr.Button("Clear")

        # Store DataFrame in state
        df_state = gr.State(None)

        def process_file(file):
            """Parse the uploaded CSV; None when the upload is cleared."""
            if file is None:
                return None
            # Accept either an object exposing the temp-file path as
            # ``.name`` or a plain path string.
            csv_path = getattr(file, "name", file)
            return pd.read_csv(csv_path)

        def respond(message, history, key, df):
            """Run the chat handler and empty the question box afterwards."""
            return chat_with_data_scientist(message, history, key, df), ""

        file_input.change(
            process_file,
            inputs=[file_input],
            outputs=[df_state]
        )
        msg.submit(
            respond,
            inputs=[msg, chat, api_key, df_state],
            outputs=[chat, msg]
        )
        clear.click(lambda: None, None, chat)
    return demo
# Build the UI at module level so `demo` exists as a module attribute.
demo = create_demo()
# Run as a script: plain launch(). Imported: launch with show_api=False.
launch_options = {} if __name__ == "__main__" else {"show_api": False}
demo.launch(**launch_options)