jzou19950715 committed on
Commit 5293476 · verified · 1 Parent(s): ad9e004

Update app.py

Files changed (1): app.py +117 -192
app.py CHANGED
@@ -1,216 +1,141 @@
- from transformers import Tool, ReactCodeAgent, HfApiEngine
- import gradio as gr
  import pandas as pd
  import numpy as np
  import plotly.express as px
- import plotly.graph_objects as go
- from typing import Dict, List, Optional
- import openai
  import seaborn as sns
- import matplotlib.pyplot as plt
- import io
- import base64

- # Custom Tools for Data Analysis
- class DataVisualizationTool(Tool):
-     name = "data_visualizer"
-     description = """Creates various types of visualizations from data:
-     - Correlation heatmaps
-     - Distribution plots
-     - Scatter plots
-     - Time series plots
-     Returns the plots as base64 encoded images."""

      inputs = {
-         "data": {
-             "type": "dict",
-             "description": "DataFrame as dictionary"
-         },
-         "plot_type": {
-             "type": "string",
-             "description": "Type of plot to create: 'heatmap', 'distribution', 'scatter'"
-         },
-         "columns": {
-             "type": "list",
-             "description": "List of columns to plot"
-         }
      }
-     output_type = "string"  # base64 encoded image

-     def forward(self, data: Dict, plot_type: str, columns: List[str]) -> str:
          df = pd.DataFrame(data)
-         plt.figure(figsize=(10, 6))
-
-         if plot_type == "heatmap":
-             sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm')
-             plt.title("Correlation Heatmap")
-         elif plot_type == "distribution":
-             for col in columns:
-                 sns.histplot(df[col], kde=True, label=col)
-             plt.title("Distribution Plot")
-             plt.legend()
-         elif plot_type == "scatter":
-             if len(columns) >= 2:
-                 sns.scatterplot(data=df, x=columns[0], y=columns[1])
-                 plt.title(f"Scatter Plot: {columns[0]} vs {columns[1]}")
-
-         # Convert plot to base64
-         buf = io.BytesIO()
-         plt.savefig(buf, format='png')
-         plt.close()
-         buf.seek(0)
-         return base64.b64encode(buf.read()).decode('utf-8')

- class DataAnalysisTool(Tool):
-     name = "data_analyzer"
-     description = """Performs statistical analysis on data:
-     - Basic statistics (mean, median, std)
-     - Correlation analysis
-     - Missing value analysis
-     - Outlier detection"""

      inputs = {
-         "data": {
-             "type": "dict",
-             "description": "DataFrame as dictionary"
-         },
-         "analysis_type": {
-             "type": "string",
-             "description": "Type of analysis: 'basic', 'correlation', 'missing', 'outliers'"
-         },
-         "columns": {
-             "type": "list",
-             "description": "List of columns to analyze"
-         }
      }
      output_type = "dict"

-     def forward(self, data: Dict, analysis_type: str, columns: List[str]) -> Dict:
          df = pd.DataFrame(data)
-         selected_cols = [col for col in columns if col in df.columns]
-
-         if analysis_type == "basic":
-             return {
-                 "statistics": df[selected_cols].describe().to_dict(),
-                 "skew": df[selected_cols].skew().to_dict(),
-                 "kurtosis": df[selected_cols].kurtosis().to_dict()
-             }
-         elif analysis_type == "correlation":
-             numeric_cols = df[selected_cols].select_dtypes(include=[np.number])
-             return {
-                 "correlation": numeric_cols.corr().to_dict(),
-                 "covariance": numeric_cols.cov().to_dict()
-             }
-         elif analysis_type == "missing":
              return {
-                 "missing_counts": df[selected_cols].isnull().sum().to_dict(),
-                 "missing_percentages": (df[selected_cols].isnull().mean() * 100).to_dict()
              }
-         elif analysis_type == "outliers":
-             outliers = {}
-             for col in selected_cols:
-                 if df[col].dtype in [np.float64, np.int64]:
-                     Q1 = df[col].quantile(0.25)
-                     Q3 = df[col].quantile(0.75)
-                     IQR = Q3 - Q1
-                     outliers[col] = {
-                         "outliers_count": len(df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]),
-                         "lower_bound": Q1 - 1.5 * IQR,
-                         "upper_bound": Q3 + 1.5 * IQR
-                     }
-             return {"outliers": outliers}

- def create_demo():
-     # Initialize tools
-     viz_tool = DataVisualizationTool()
-     analysis_tool = DataAnalysisTool()
-
-     # Create agent with tools
-     llm_engine = HfApiEngine()  # Uses default model
-     agent = ReactCodeAgent(
-         tools=[viz_tool, analysis_tool],
-         llm_engine=llm_engine
-     )

-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("# 🔬 Advanced Data Analysis Agent")
-
-         with gr.Row():
-             with gr.Column():
-                 api_key = gr.Textbox(
-                     label="OpenAI API Key",
-                     type="password",
-                     placeholder="sk-..."
-                 )
-                 file_input = gr.File(
-                     label="Upload CSV",
-                     file_types=[".csv"]
-                 )
-                 with gr.Accordion("Advanced Settings", open=False):
-                     system_prompt = gr.Textbox(
-                         label="System Prompt",
-                         value="""You are a data science expert. Analyze the data and create
-                         visualizations to help understand patterns and insights.""",
-                         lines=3
-                     )
-
-             with gr.Column():
-                 chat = gr.Chatbot(label="Analysis Chat")
-                 msg = gr.Textbox(
-                     label="Ask about your data",
-                     placeholder="What insights can you find in this dataset?"
-                 )
-                 clear = gr.Button("Clear")
-
-         # State for storing the DataFrame
-         df_state = gr.State(None)
-
-         def process_file(file):
-             if file is None:
-                 return None
-             return pd.read_csv(file.name)
-
-         def process_message(message, chat_history, api_key, df):
-             if df is None:
-                 return chat_history + [(message, "Please upload a CSV file first.")]
-
-             try:
-                 # Convert DataFrame to dict for tools
-                 data_dict = df.to_dict()
-
-                 # Get all columns for potential analysis
-                 columns = list(df.columns)
-
-                 # Use agent to analyze and create visualizations
-                 response = agent.run(
-                     f"""Analyze this data: {message}
-                     Available columns: {columns}
-                     Use the data_analyzer and data_visualizer tools to create insights."""
-                 )
-
-                 return chat_history + [(message, response)]
-
-             except Exception as e:
-                 return chat_history + [(message, f"Error: {str(e)}")]

-         file_input.change(
-             process_file,
-             inputs=[file_input],
-             outputs=[df_state]
-         )

-         msg.submit(
-             process_message,
-             inputs=[msg, chat, api_key, df_state],
-             outputs=[chat]
-         )

-         clear.click(lambda: None, None, chat)

-     return demo
-
- if __name__ == "__main__":
-     demo = create_demo()
-     demo.launch()
- else:
-     demo.launch(show_api=False)

+ from transformers import Tool
  import pandas as pd
  import numpy as np
  import plotly.express as px
  import seaborn as sns
+ from sklearn import preprocessing, decomposition, metrics

+ # 1. Data Loading and Preprocessing Tool
+ class DataPreprocessingTool(Tool):
+     name = "data_preprocessor"
+     description = "Handles data loading, cleaning, and preprocessing tasks"

      inputs = {
+         "data": {"type": "dict", "description": "Input data dictionary"},
+         "operation": {"type": "string", "description": "Operation to perform: clean/encode/normalize"}
      }
+     output_type = "dict"

+     def forward(self, data: dict, operation: str) -> dict:
          df = pd.DataFrame(data)
+         if operation == "clean":
+             # Drop duplicate rows, then mean-impute missing numeric values
+             df = df.drop_duplicates()
+             df = df.fillna(df.mean(numeric_only=True))
+         elif operation == "encode":
+             # Label-encode categorical (object-dtype) columns
+             le = preprocessing.LabelEncoder()
+             for col in df.select_dtypes(include=['object']):
+                 df[col] = le.fit_transform(df[col].astype(str))
+         elif operation == "normalize":
+             # Standardize numeric columns to zero mean, unit variance
+             scaler = preprocessing.StandardScaler()
+             numeric_cols = df.select_dtypes(include=[np.number]).columns
+             df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
+         return df.to_dict()
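
A minimal usage sketch for the preprocessing tool, calling forward() directly for illustration; the sample records and the 'age'/'city' columns are made up, not part of the commit:

# Hypothetical check of DataPreprocessingTool on illustrative data.
tool = DataPreprocessingTool()
raw = {
    "age": {0: 25.0, 1: None, 2: 25.0},
    "city": {0: "Paris", 1: "Tokyo", 2: "Paris"},
}
cleaned = tool.forward(raw, "clean")       # drops the duplicate row, mean-imputes 'age'
encoded = tool.forward(cleaned, "encode")  # label-encodes 'city' to integers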

+ # 2. Statistical Analysis Tool
+ class StatisticalAnalysisTool(Tool):
+     name = "statistical_analyzer"
+     description = "Performs statistical analysis on data"

      inputs = {
+         "data": {"type": "dict", "description": "Input data dictionary"},
+         "analysis_type": {"type": "string", "description": "Type of analysis: descriptive/inferential/correlation"}
      }
      output_type = "dict"

+     def forward(self, data: dict, analysis_type: str) -> dict:
          df = pd.DataFrame(data)
+         if analysis_type == "descriptive":
              return {
+                 "summary": df.describe().to_dict(),
+                 "skewness": df.skew(numeric_only=True).to_dict(),
+                 "kurtosis": df.kurtosis(numeric_only=True).to_dict()
              }
+         elif analysis_type == "inferential":
+             # Run a D'Agostino-Pearson normality test on each numeric column
+             from scipy import stats
+             results = {}
+             numeric_cols = df.select_dtypes(include=[np.number]).columns
+             for col in numeric_cols:
+                 stat, p_value = stats.normaltest(df[col].dropna())
+                 results[col] = {"statistic": stat, "p_value": p_value}
+             return results
+         return df.corr(numeric_only=True).to_dict()
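
A quick sketch of calling the analyzer directly; the single made-up column 'x' is illustrative. Note scipy's normaltest needs at least 8 observations and warns below 20:

# Hypothetical check of StatisticalAnalysisTool.
tool = StatisticalAnalysisTool()
data = {"x": {i: float(i) for i in range(30)}}
summary = tool.forward(data, "descriptive")    # describe() plus skewness/kurtosis
normality = tool.forward(data, "inferential")  # normality statistic and p-value per column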

+ # 3. Advanced Visualization Tool
+ class AdvancedVisualizationTool(Tool):
+     name = "advanced_visualizer"
+     description = "Creates advanced statistical and ML visualizations"

+     inputs = {
+         "data": {"type": "dict", "description": "Input data dictionary"},
+         "viz_type": {"type": "string", "description": "Type of visualization: pca/cluster"},
+         "params": {"type": "dict", "description": "Additional parameters"}
+     }
+     output_type = "dict"

+     def forward(self, data: dict, viz_type: str, params: dict) -> dict:
+         df = pd.DataFrame(data)
+         if viz_type == "pca":
+             # Project the numeric columns onto the first two principal components
+             pca = decomposition.PCA(n_components=2)
+             numeric_cols = df.select_dtypes(include=[np.number]).columns
+             pca_result = pca.fit_transform(df[numeric_cols])
+             fig = px.scatter(x=pca_result[:, 0], y=pca_result[:, 1],
+                              title='PCA Visualization')
+             return {"plot": fig.to_dict()}
+         elif viz_type == "cluster":
+             # Color a scatter plot by k-means cluster assignment
+             from sklearn.cluster import KMeans
+             kmeans = KMeans(n_clusters=params.get("n_clusters", 3))
+             numeric_cols = df.select_dtypes(include=[np.number]).columns
+             clusters = kmeans.fit_predict(df[numeric_cols])
+             fig = px.scatter(df, x=params.get("x"), y=params.get("y"),
+                              color=clusters, title='Cluster Visualization')
+             return {"plot": fig.to_dict()}
+         return {}
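
A sketch of driving the visualizer; the two made-up columns 'a'/'b' are illustrative, and rebuilding the figure from its dict form via plotly.graph_objects is an assumption about how a caller would consume the output:

# Hypothetical check of AdvancedVisualizationTool.
import plotly.graph_objects as go
tool = AdvancedVisualizationTool()
data = {"a": {i: float(i) for i in range(10)},
        "b": {i: float(i % 3) for i in range(10)}}
result = tool.forward(data, "pca", params={})
fig = go.Figure(result["plot"])  # reconstruct the Plotly figure from its dict form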

+ # 4. Machine Learning Tool
+ class MLModelTool(Tool):
+     name = "ml_modeler"
+     description = "Trains and evaluates machine learning models"

+     inputs = {
+         "data": {"type": "dict", "description": "Input data dictionary"},
+         "target": {"type": "string", "description": "Target column name"},
+         "model_type": {"type": "string", "description": "Type of model to train: regression/classification"}
+     }
+     output_type = "dict"

+     def forward(self, data: dict, target: str, model_type: str) -> dict:
+         from sklearn.model_selection import train_test_split
+         from sklearn.metrics import mean_squared_error, accuracy_score

+         df = pd.DataFrame(data)
+         X = df.drop(columns=[target])
+         y = df[target]

+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

+         if model_type == "regression":
+             from sklearn.linear_model import LinearRegression
+             model = LinearRegression()
+             model.fit(X_train, y_train)
+             y_pred = model.predict(X_test)
+             return {
+                 "mse": mean_squared_error(y_test, y_pred),
+                 "r2": model.score(X_test, y_test),
+                 "coefficients": dict(zip(X.columns, model.coef_))
+             }
+         elif model_type == "classification":
+             from sklearn.ensemble import RandomForestClassifier
+             model = RandomForestClassifier()
+             model.fit(X_train, y_train)
+             y_pred = model.predict(X_test)
+             return {
+                 "accuracy": accuracy_score(y_test, y_pred),
+                 "feature_importance": dict(zip(X.columns, model.feature_importances_))
+             }
+         return {}
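
Finally, a sketch of the modeling tool on synthetic data; the generated frame and the y = 2*x1 - x2 relationship are illustrative, not from the commit:

# Hypothetical check of MLModelTool on synthetic regression data.
rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=100), "x2": rng.normal(size=100)})
df["y"] = 2 * df["x1"] - df["x2"] + rng.normal(scale=0.1, size=100)
tool = MLModelTool()
report = tool.forward(df.to_dict(), target="y", model_type="regression")
# report carries "mse", "r2", and per-feature "coefficients"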