Spaces:

jzou19950715
/

Lossdog_Data_Science_Expert

Running

App Files Files Community

jzou19950715 committed on Jan 20

Commit

e279441

verified ·

1 Parent(s): 4722ac6

Update app.py

Browse files

Files changed (1) hide show

app.py +251 -239

app.py CHANGED Viewed

@@ -2,249 +2,261 @@ import os
 import requests
 import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import numpy as np
 from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import LabelEncoder
-##############################################################################
-# GPT-4o-mini Placeholder - Adjust for your real endpoint & JSON
-##############################################################################
-def call_gpt4o_mini(api_key, user_prompt):
-    """
-    Hypothetical call to GPT-4o-mini with an sk-... style token.
-    Example endpoint: https://api.gpt4o-mini.com/v1/chat
-    - Adjust JSON structure and keys to your actual service spec.
-    """
-    if not api_key or not api_key.startswith("sk-"):
-        return "Please provide a valid GPT-4o-mini token (sk-...)."
-    url = "https://api.gpt4o-mini.com/v1/chat"  # <--- Replace with real endpoint
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-    payload = {
-        "prompt": user_prompt,
-        "max_tokens": 128,  # limit tokens for cost
-        "temperature": 0.7,
-    }
-    try:
-        response = requests.post(url, json=payload, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-        # Suppose the text is in data["choices"][0]["text"] (adjust if needed)
-        return data["choices"][0]["text"]
-    except Exception as e:
-        return f"Error calling GPT-4o-mini: {str(e)}"
-##############################################################################
-# Local Data Analysis
-##############################################################################
-def extended_analysis(df):
-    """
-    Does correlation heatmap, bar plot for 'Career', and logistic regression
-    if 'Career' has multiple categories. Returns (list_of_image_paths, info_string).
-    """
-    output_paths = []
-    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
-    # 1) Correlation Heatmap
-    if len(numeric_cols) > 1:
-        corr = df[numeric_cols].corr()
-        plt.figure(figsize=(8, 6))
-        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
-        plt.title("Correlation Heatmap")
-        heatmap_path = "heatmap.png"
-        plt.savefig(heatmap_path)
-        plt.close()
-        output_paths.append(heatmap_path)
-    # 2) Bar Plot for 'Career'
-    if "Career" in df.columns:
-        plt.figure(figsize=(8, 5))
-        career_counts = df["Career"].value_counts()
-        sns.barplot(x=career_counts.index, y=career_counts.values)
-        plt.title("Distribution of Careers")
-        plt.xlabel("Career")
-        plt.ylabel("Count")
-        plt.xticks(rotation=45, ha="right")
-        barplot_path = "career_distribution.png"
-        plt.savefig(barplot_path)
-        plt.close()
-        output_paths.append(barplot_path)
-    # 3) Simple Logistic Regression
-    if "Career" in df.columns and len(numeric_cols) > 0:
-        le = LabelEncoder()
-        df["Career_encoded"] = le.fit_transform(df["Career"])
-        X = df[numeric_cols].fillna(0)
-        y = df["Career_encoded"]
-        if len(np.unique(y)) > 1:
-            X_train, X_test, y_train, y_test = train_test_split(
-                X, y, test_size=0.2, random_state=42
             )
-            model = LogisticRegression(max_iter=1000)
-            model.fit(X_train, y_train)
-            score = model.score(X_test, y_test)
-            accuracy_info = f"Logistic Regression accuracy: {score:.2f}"
         else:
-            accuracy_info = "Only one category in 'Career'; no classification performed."
-    else:
-        accuracy_info = "No 'Career' column or insufficient numeric columns for classification."
-    return output_paths, accuracy_info
-##############################################################################
-# Main Chat/Analysis Function
-##############################################################################
-def handle_chat(user_message, df, chat_history, api_key):
-    """
-    - If df is None, prompt user to upload a CSV.
-    - Else, do local analysis and optionally call GPT-4o-mini for suggestions.
-    - Update the chat_history with role='user' or role='assistant' messages.
-    - Return new chat_history in 'messages' format for the Gradio Chatbot (type='messages').
-    """
-    if df is None:
-        chat_history.append({"role": "assistant", "content": "Please upload a CSV first."})
-        return chat_history
-    # Summarize data
-    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
-    cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
-    summary = (
-        f"Rows: {df.shape[0]}, Columns: {df.shape[1]}\n"
-        f"Numeric: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
-        f"Categorical: {', '.join(cat_cols) if cat_cols else 'None'}"
-    )
-    # Always show user message in chat
-    chat_history.append({"role": "user", "content": user_message})
-    # Possibly call GPT-4o-mini for suggestions
-    gpt_reply = ""
-    if api_key:
-        prompt = f"Data Summary:\n{summary}\nUser Query: {user_message}"
-        gpt_reply = call_gpt4o_mini(api_key, prompt)
-    # Build the reply text (local summary + LLM suggestions)
-    reply_text = f"**Data Summary**:\n{summary}"
-    if gpt_reply:
-        reply_text += f"\n\n**GPT-4o-mini**: {gpt_reply}"
-    # Check if user wants extended analysis
-    triggers = ["sample analysis", "extended analysis", "advanced analysis", "run analysis", "visualize", "plot"]
-    if any(t in user_message.lower() for t in triggers):
-        # Perform extended analysis
-        image_paths, info = extended_analysis(df)
-        if info:
-            reply_text += f"\n\n**Analysis Info**: {info}"
-        # Add images to chat
-        chat_history.append({"role": "assistant", "content": reply_text})
-        # Return images as separate chat items
-        for path in image_paths:
-            chat_history.append({"role": "assistant", "content": None, "image": path})
-        return chat_history
-    # If no extended analysis triggered, just add the text
-    chat_history.append({"role": "assistant", "content": reply_text})
-    return chat_history
-##############################################################################
-# Gradio Interface
-##############################################################################
-def create_demo():
-    with gr.Blocks() as demo:
-        # State: holds the DataFrame and the chat messages
-        df_state = gr.State(None)
-        chat_state = gr.State([])  # store messages as list of dicts: [{"role": "...", "content": "..."}]
-        gr.Markdown("## GPT-4o-mini Data Analysis Assistant (Chat)")
-        gr.Markdown(
-            """
-            1. Enter your GPT-4o-mini token (`sk-...`) if you want AI suggestions.
-            2. Upload a CSV file.
-            3. Ask questions or request "sample analysis", "visualize", etc.
-            4. Images are displayed in the chat when relevant.
-            """
-        )
-        api_key_box = gr.Textbox(label="GPT-4o-mini Token (sk-...)", placeholder="Optional: sk-xxxx")
-        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
-        # Chatbot in "messages" format to fix the deprecation warning
-        chatbot = gr.Chatbot(label="Chat Output", type="messages")
-        user_message = gr.Textbox(label="Your Message", placeholder="Ask about your data...")
-        def upload_csv(file):
-            """
-            On file upload, load the DataFrame into df_state and reset the chat if needed.
-            """
-            if file is None:
-                return None
-            df = pd.read_csv(file.name)
-            return df
-        file_input.change(fn=upload_csv, inputs=file_input, outputs=df_state)
-        def on_user_message(message, df, chat_history, api_key):
-            """
-            Called when user sends a message. Handle chat + analysis. Return new chat messages.
-            """
-            if not message.strip():
-                return chat_history  # ignore empty
-            updated_history = handle_chat(message, df, chat_history, api_key)
-            return updated_history
-        user_message.submit(
-            fn=on_user_message,
-            inputs=[user_message, df_state, chat_state, api_key_box],
-            outputs=chat_state
-        ).then(
-            # After updating chat_state, reflect it in the chatbot
-            fn=lambda messages: messages,
-            inputs=chat_state,
-            outputs=chatbot
-        ).then(
-            fn=lambda: "",
-            outputs=user_message
-        )
-        # Button to send message
-        send_btn = gr.Button("Send")
-        send_btn.click(
-            fn=on_user_message,
-            inputs=[user_message, df_state, chat_state, api_key_box],
-            outputs=chat_state
-        ).then(
-            fn=lambda messages: messages,
-            inputs=chat_state,
-            outputs=chatbot
-        ).then(
-            fn=lambda: "",
-            outputs=user_message
-        )
-        # Clear chat button
-        clear_btn = gr.Button("Clear Chat")
-        def clear_chat():
-            return [], []
-        clear_btn.click(
-            fn=clear_chat,
-            inputs=[],
-            outputs=[chat_state, chatbot]
-        )
-    return demo
-demo = create_demo()
 if __name__ == "__main__":
-    demo.launch(share=True)

 import requests
 import gradio as gr
 import pandas as pd
+import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+from itertools import combinations
+from typing import Dict, List, Tuple, Optional
+from dataclasses import dataclass
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.impute import SimpleImputer
+import statsmodels.api as sm
+import plotly.express as px
+import plotly.graph_objects as go
+from scipy import stats
+@dataclass
+class AnalysisConfig:
+    """Configuration for analysis parameters.
+
+    Only ``significance_level`` is read by the code in this file (the
+    normality and chi-square tests in ``DataAnalyzer``). The other fields are
+    declared with defaults but are not referenced anywhere in this file.
+    """
+    # Not referenced in this file; presumably a cap on analysis passes -- TODO confirm.
+    max_iterations: int = 5
+    # Not referenced in this file; presumably a minimum row count -- TODO confirm.
+    min_samples_for_analysis: int = 30
+    # Not referenced in this file; presumably a cut-off for "strong" correlation -- TODO confirm.
+    correlation_threshold: float = 0.7
+    # Not referenced in this file; presumably limits categories shown in plots -- TODO confirm.
+    max_categories_for_viz: int = 10
+    # p-value threshold compared against test results in perform_statistical_tests.
+    significance_level: float = 0.05
+class DataAnalyzer:
+    """Data analysis agent: profiles a DataFrame, runs statistical tests, plots, and fits a baseline model.
+
+    Attributes:
+        api_key: Bearer token sent with every GPT-4o-mini request.
+        config: AnalysisConfig instance; only ``significance_level`` is read here.
+        current_iteration: Number of figures saved so far; used to name the next file.
+        analysis_results: Empty list created at construction; not written by this class.
+    """
+    # scipy's normality test is built on its skewness test, which needs at least 8 observations.
+    _MIN_NORMALTEST_SAMPLES = 8
+    # Plot kinds understood by create_visualization.
+    _VIZ_TYPES = ("correlation", "distribution", "boxplot")
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.config = AnalysisConfig()
+        self.current_iteration = 0
+        self.analysis_results = []
+    def call_gpt4o_mini(self, prompt: str) -> str:
+        """Send ``prompt`` to the GPT-4o-mini endpoint and return the reply text.
+
+        Never raises: any failure (network, HTTP status, unexpected JSON shape)
+        is returned as an ``"API Error: ..."`` string so the caller can display it.
+        """
+        try:
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json"
+            }
+            response = requests.post(
+                "https://api.gpt4o-mini.example.com/v1/chat",  # Replace with actual endpoint
+                json={"prompt": prompt, "max_tokens": 500, "temperature": 0.7},
+                headers=headers,
+                timeout=15
             )
+            response.raise_for_status()
+            return response.json()["choices"][0]["text"]
+        except Exception as e:
+            # Boundary with an external service: report the problem instead of crashing the UI.
+            return f"API Error: {str(e)}"
+    def analyze_data_types(self, df: pd.DataFrame) -> Dict:
+        """Classify columns by dtype and collect per-column missing/unique counts.
+
+        Returns:
+            Dict with keys ``numeric_cols``, ``categorical_cols``, ``temporal_cols``
+            (lists of column names) and ``missing_values``, ``unique_counts``
+            (column name -> count).
+        """
+        return {
+            # "number" also matches int32/float32 and other numeric widths, not only the 64-bit ones.
+            "numeric_cols": df.select_dtypes(include=['number']).columns.tolist(),
+            "categorical_cols": df.select_dtypes(include=['object', 'category']).columns.tolist(),
+            "temporal_cols": df.select_dtypes(include=['datetime64']).columns.tolist(),
+            "missing_values": df.isnull().sum().to_dict(),
+            "unique_counts": df.nunique().to_dict()
+        }
+    def create_visualization(self, df: pd.DataFrame, viz_type: str, columns: List[str]) -> str:
+        """Render one plot of ``columns`` and save it as a PNG in the working directory.
+
+        Args:
+            df: Source data.
+            viz_type: One of "correlation", "distribution" or "boxplot".
+            columns: Numeric column names to plot.
+
+        Returns:
+            Path of the saved image, ``viz_<n>.png``; ``n`` advances after every
+            successful save so successive plots never overwrite each other.
+
+        Raises:
+            ValueError: If ``viz_type`` is unknown or ``columns`` is empty.
+        """
+        if viz_type not in self._VIZ_TYPES:
+            raise ValueError(f"Unknown visualization type: {viz_type!r}")
+        if len(columns) == 0:
+            raise ValueError("No columns supplied for visualization")
+        try:
+            if viz_type == "distribution":
+                # One panel per column; overlaying columns with different scales is unreadable.
+                n_plots = len(columns)
+                n_across = min(n_plots, 3)
+                n_down = -(-n_plots // n_across)  # ceiling division
+                fig, axes = plt.subplots(
+                    n_down, n_across,
+                    figsize=(max(10, 5 * n_across), max(6, 4 * n_down)),
+                    squeeze=False,
+                )
+                panels = axes.ravel()
+                for panel, col in zip(panels, columns):
+                    sns.histplot(data=df, x=col, kde=True, ax=panel)
+                    panel.set_title(f"Distribution of {col}")
+                # Hide the unused cells of the grid.
+                for panel in panels[n_plots:]:
+                    panel.set_visible(False)
+                fig.tight_layout()
+            else:
+                plt.figure(figsize=(10, 6))
+                if viz_type == "correlation":
+                    sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm')
+                    plt.title("Correlation Matrix")
+                else:
+                    sns.boxplot(data=df[columns])
+                    plt.title("Box Plot of Numeric Variables")
+            output_path = f"viz_{self.current_iteration}.png"
+            plt.savefig(output_path)
+        finally:
+            # Release the figure even when plotting or saving fails.
+            plt.close()
+        self.current_iteration += 1
+        return output_path
+    def perform_statistical_tests(self, df: pd.DataFrame, data_types: Dict) -> Dict:
+        """Run a normality test per numeric column and a chi-square test per categorical pair.
+
+        Args:
+            df: Source data.
+            data_types: Mapping with ``numeric_cols`` and ``categorical_cols`` lists,
+                as produced by ``analyze_data_types``.
+
+        Returns:
+            Dict keyed ``normality_<col>`` and ``chi2_<colA>_<colB>``. Each value holds
+            ``statistic``, ``p_value`` and a boolean verdict (``is_normal`` or
+            ``is_significant``) judged against ``config.significance_level``.
+            Columns with too few non-null values and pairs with no overlapping
+            data are skipped.
+        """
+        results = {}
+        alpha = self.config.significance_level
+        # Normality tests for numeric columns
+        for col in data_types["numeric_cols"]:
+            sample = df[col].dropna()
+            if len(sample) < self._MIN_NORMALTEST_SAMPLES:
+                continue
+            stat, p_value = stats.normaltest(sample)
+            results[f"normality_{col}"] = {
+                "statistic": float(stat),
+                "p_value": float(p_value),
+                "is_normal": bool(p_value > alpha)
+            }
+        # Chi-square tests for each unordered pair of categorical columns
+        for col_a, col_b in combinations(data_types["categorical_cols"], 2):
+            # Order the pair by name so the result key is stable regardless of column order.
+            first, second = sorted((col_a, col_b), key=str)
+            if first == second:
+                continue
+            contingency = pd.crosstab(df[first], df[second])
+            if contingency.size == 0:
+                # No row has both values present; the test is undefined.
+                continue
+            chi2, p_value, _, _ = stats.chi2_contingency(contingency)
+            results[f"chi2_{first}_{second}"] = {
+                "statistic": float(chi2),
+                "p_value": float(p_value),
+                "is_significant": bool(p_value < alpha)
+            }
+        return results
+    def train_predictive_model(self, df: pd.DataFrame, target_col: str) -> Tuple[float, str]:
+        """Fit a random-forest baseline for ``target_col`` and score it on a 20% hold-out.
+
+        A classifier is used when the target is non-numeric or has at most five
+        distinct values; otherwise a regressor is used. Rows whose target is
+        missing are dropped before training.
+
+        Returns:
+            ``(score, metric)`` where ``metric`` is ``'accuracy'`` or ``'r2'``.
+        """
+        labelled = df.dropna(subset=[target_col])
+        X = labelled.drop(columns=[target_col])
+        y = labelled[target_col]
+        # Preprocessing
+        numeric_transformer = Pipeline([
+            ('imputer', SimpleImputer(strategy='median')),
+            ('scaler', StandardScaler())
+        ])
+        categorical_transformer = Pipeline([
+            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+            ('onehot', OneHotEncoder(handle_unknown='ignore'))
+        ])
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', numeric_transformer, X.select_dtypes(include=['number']).columns),
+                ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)
+            ])
+        # String labels can never be regressed, however many classes there are.
+        is_classification = (not pd.api.types.is_numeric_dtype(y)) or y.nunique() <= 5
+        if is_classification:
+            model = RandomForestClassifier(n_estimators=100, random_state=42)
+            metric = 'accuracy'
+        else:
+            model = RandomForestRegressor(n_estimators=100, random_state=42)
+            metric = 'r2'
+        pipeline = Pipeline([
+            ('preprocessor', preprocessor),
+            ('model', model)
+        ])
+        # Train and evaluate
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        pipeline.fit(X_train, y_train)
+        y_pred = pipeline.predict(X_test)
+        if metric == 'accuracy':
+            score = accuracy_score(y_test, y_pred)
         else:
+            score = r2_score(y_test, y_pred)
+        return score, metric
+class GradioInterface:
+    """Gradio front end: collects an API key and a CSV, runs DataAnalyzer, shows text and plots.
+
+    Attributes:
+        analyzer: DataAnalyzer built for the most recent analysis, or None before the first run.
+        df: DataFrame loaded by the most recent analysis, or None before the first run.
+    """
+    def __init__(self):
+        self.analyzer = None
+        self.df = None
+    def create_interface(self):
+        """Build the Blocks layout, wire the two buttons and return the ``gr.Blocks`` app."""
+        with gr.Blocks() as demo:
+            gr.Markdown("# Intelligent Data Analysis Agent")
+            with gr.Row():
+                api_key = gr.Textbox(label="GPT-4o-mini API Key", type="password")
+                file_input = gr.File(label="Upload CSV file")
+            with gr.Row():
+                analysis_notes = gr.Textbox(label="Analysis Notes (Optional)",
+                                          placeholder="Any specific analysis preferences...")
+            with gr.Row():
+                analyze_btn = gr.Button("Analyze Data")
+                clear_btn = gr.Button("Clear")
+            output_text = gr.Markdown()
+            output_gallery = gr.Gallery()
+            def analyze(api_key, file, notes):
+                """Run the full analysis; return (markdown summary, list of image paths or None)."""
+                if not api_key or not file:
+                    return "Please provide both API key and data file.", None
+                try:
+                    # The upload may arrive as a path string or as an object exposing .name.
+                    csv_path = getattr(file, "name", file)
+                    self.df = pd.read_csv(csv_path)
+                    self.analyzer = DataAnalyzer(api_key)
+                    # Get AI suggestions for analysis
+                    prompt = f"Data columns: {list(self.df.columns)}\nUser notes: {notes}\nSuggest appropriate analyses and visualizations."
+                    ai_suggestions = self.analyzer.call_gpt4o_mini(prompt)
+                    # Perform analysis
+                    data_types = self.analyzer.analyze_data_types(self.df)
+                    stats_results = self.analyzer.perform_statistical_tests(self.df, data_types)
+                    # Create visualizations
+                    viz_paths = []
+                    for viz_type in ["correlation", "distribution", "boxplot"]:
+                        if data_types["numeric_cols"]:
+                            path = self.analyzer.create_visualization(
+                                self.df, viz_type, data_types["numeric_cols"]
+                            )
+                            viz_paths.append(path)
+                    summary = self._build_summary(ai_suggestions, self.df, data_types, stats_results)
+                    return summary, viz_paths
+                except Exception as e:
+                    # UI boundary: show the failure to the user instead of crashing the callback.
+                    return f"Error during analysis: {str(e)}", None
+            analyze_btn.click(
+                analyze,
+                inputs=[api_key, file_input, analysis_notes],
+                outputs=[output_text, output_gallery]
+            )
+            clear_btn.click(
+                lambda: (None, None),
+                outputs=[output_text, output_gallery]
+            )
+        return demo
+    @staticmethod
+    def _build_summary(ai_suggestions: str, df: pd.DataFrame, data_types: Dict, stats_results: Dict) -> str:
+        """Assemble the Markdown report shown after an analysis.
+
+        Lines are joined without leading whitespace: Markdown treats text indented
+        by four or more spaces as a code block, which would hide the headings.
+        """
+        tests_section = GradioInterface._format_stats_results(stats_results)
+        return "\n".join([
+            "## Data Analysis Results",
+            "",
+            "### AI Suggestions",
+            f"{ai_suggestions}",
+            "",
+            "### Basic Statistics",
+            f"- Rows: {len(df)}",
+            f"- Columns: {len(df.columns)}",
+            f"- Missing Values: {sum(data_types['missing_values'].values())}",
+            "",
+            "### Statistical Tests",
+            tests_section,
+        ])
+    @staticmethod
+    def _format_stats_results(results: Dict) -> str:
+        """Format statistical results as one Markdown bullet per test.
+
+        Keys are matched by prefix (``normality_`` / ``chi2_``) so a column whose
+        own name contains "normality" or "chi2" cannot select the wrong branch.
+        Entries with any other prefix are ignored.
+        """
+        formatted = []
+        for test_name, result in results.items():
+            if test_name.startswith("normality_"):
+                verdict = 'Normal' if result['is_normal'] else 'Non-normal'
+            elif test_name.startswith("chi2_"):
+                verdict = 'Significant' if result['is_significant'] else 'Not significant'
+            else:
+                continue
+            formatted.append(f"- {test_name}: {verdict} (p={result['p_value']:.4f})")
+        return "\n".join(formatted)
 if __name__ == "__main__":
+    # Script entry point: build the interface and start the Gradio server.
+    interface = GradioInterface()
+    demo = interface.create_interface()
+    # NOTE(review): share=True is passed straight to launch(); presumably unnecessary
+    # when the app is hosted as a Space -- confirm before keeping it.
+    demo.launch(share=True)