"""Gradio front end for an LLM-assisted exploratory data analysis agent.

The module exposes two classes:

* ``DataAnalyzer`` inspects a DataFrame, runs statistical tests, renders
  plots, trains a baseline model and queries GPT-4o-mini for suggestions.
* ``GradioInterface`` wires those capabilities into a small web UI.
"""

import math
import os
from dataclasses import dataclass
from itertools import combinations
from typing import Dict, List, Tuple, Optional

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import seaborn as sns
import statsmodels.api as sm
from matplotlib.figure import Figure
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Chat Completions REST endpoint used by ``DataAnalyzer.call_gpt4o_mini``.
_OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"

# Upper bound on how long a single LLM request may block the UI callback.
_REQUEST_TIMEOUT_SECONDS = 60

# ``scipy.stats.normaltest`` combines skewtest and kurtosistest; skewtest is
# not valid for fewer than 8 observations.
_MIN_NORMALTEST_SAMPLES = 8

# Widest grid used when laying out one histogram per column.
_MAX_DISTRIBUTION_GRID_COLS = 3


@dataclass
class AnalysisConfig:
    """Configuration for analysis parameters"""

    max_iterations: int = 5
    min_samples_for_analysis: int = 30
    correlation_threshold: float = 0.7
    max_categories_for_viz: int = 10
    significance_level: float = 0.05


class DataAnalyzer:
    """Intelligent data analysis agent that determines appropriate visualizations and analyses"""

    def __init__(self, api_key: str):
        """Store the API key and initialise per-run state.

        Args:
            api_key: Bearer token sent with every chat-completion request.
        """
        self.api_key = api_key
        self.config = AnalysisConfig()
        self.current_iteration = 0
        self.analysis_results = []

    def call_gpt4o_mini(self, prompt: str, system_prompt: str) -> str:
        """Call GPT-4o-mini API with proper error handling

        The request goes through ``requests`` (already a dependency of this
        module) straight to the REST endpoint, so no SDK import is needed.

        Args:
            prompt: User message.
            system_prompt: System message that frames the assistant's role.

        Returns:
            The assistant's reply, or a string starting with ``"API Error: "``
            when the request fails or the response has an unexpected shape.
        """
        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 500,
            "temperature": 0.7,
        }
        try:
            response = requests.post(
                _OPENAI_CHAT_URL,
                headers={"Authorization": f"Bearer {self.api_key}"},
                json=payload,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except (
            requests.RequestException,
            KeyError,
            IndexError,
            TypeError,
            ValueError,
        ) as e:
            # Network/HTTP failures and malformed payloads are reported as
            # text so the caller can show them instead of crashing.
            return f"API Error: {str(e)}"

    def analyze_data_types(self, df: pd.DataFrame) -> Dict:
        """Analyze data types and basic statistics of the DataFrame

        Args:
            df: Data to inspect.

        Returns:
            A dict with the keys ``numeric_cols``, ``categorical_cols`` and
            ``temporal_cols`` (lists of column labels selected by dtype),
            plus ``missing_values`` and ``unique_counts`` (per-column counts).
        """

        def columns_of(dtypes: List[str]) -> List:
            return df.select_dtypes(include=dtypes).columns.tolist()

        return {
            "numeric_cols": columns_of(['int64', 'float64']),
            "categorical_cols": columns_of(['object', 'category']),
            "temporal_cols": columns_of(['datetime64']),
            "missing_values": df.isnull().sum().to_dict(),
            "unique_counts": df.nunique().to_dict(),
        }

    def create_visualization(self, df: pd.DataFrame, viz_type: str, columns: List[str]) -> str:
        """Create and save visualization based on data types and relationships

        Figures are built with ``matplotlib.figure.Figure`` rather than the
        global pyplot state, so nothing is left registered with pyplot when
        plotting fails part-way through a server callback.

        Args:
            df: Source data.
            viz_type: One of ``"correlation"``, ``"distribution"`` or
                ``"boxplot"``.
            columns: Columns of ``df`` to plot.

        Returns:
            Path of the PNG file that was written.

        Raises:
            ValueError: If ``columns`` is empty or ``viz_type`` is unknown.
        """
        if not columns:
            raise ValueError("columns must not be empty")

        if viz_type == "correlation":
            fig = Figure(figsize=(10, 6))
            ax = fig.subplots()
            sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm', ax=ax)
            ax.set_title("Correlation Matrix")
        elif viz_type == "distribution":
            fig = self._plot_distributions(df, columns)
        elif viz_type == "boxplot":
            fig = Figure(figsize=(10, 6))
            ax = fig.subplots()
            sns.boxplot(data=df[columns], ax=ax)
            ax.set_title("Box Plot of Numeric Variables")
        else:
            raise ValueError(f"Unsupported viz_type: {viz_type!r}")

        # The plot kind is part of the file name: current_iteration alone
        # does not change between plots, so successive plots would otherwise
        # overwrite each other.
        output_path = f"viz_{self.current_iteration}_{viz_type}.png"
        fig.savefig(output_path)
        return output_path

    @staticmethod
    def _plot_distributions(df: pd.DataFrame, columns: List[str]) -> Figure:
        """Draw one histogram (with KDE) per column on a grid of subplots."""
        n_cols = min(_MAX_DISTRIBUTION_GRID_COLS, len(columns))
        n_rows = math.ceil(len(columns) / n_cols)
        fig = Figure(figsize=(max(10, 5 * n_cols), max(6, 4 * n_rows)))
        axes = fig.subplots(n_rows, n_cols, squeeze=False).ravel()

        for ax, col in zip(axes, columns):
            sns.histplot(data=df, x=col, kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")

        # Hide the unused cells of a partially filled last row.
        for ax in axes[len(columns):]:
            ax.set_visible(False)

        fig.tight_layout()
        return fig

    def perform_statistical_tests(self, df: pd.DataFrame, data_types: Dict) -> Dict:
        """Perform relevant statistical tests based on data types

        Args:
            df: Source data.
            data_types: Mapping with ``numeric_cols`` and ``categorical_cols``
                lists, as produced by ``analyze_data_types``.

        Returns:
            A dict keyed ``normality_<col>`` and ``chi2_<col1>_<col2>``. Each
            value holds ``statistic``, ``p_value`` and a boolean verdict
            (``is_normal`` / ``is_significant``) as plain Python scalars.
            Columns or pairs that cannot be tested are omitted.
        """
        results: Dict = {}
        alpha = self.config.significance_level

        # Normality tests for numeric columns
        for col in data_types["numeric_cols"]:
            values = df[col].dropna()
            if len(values) < _MIN_NORMALTEST_SAMPLES:
                continue
            stat, p_value = stats.normaltest(values)
            results[f"normality_{col}"] = {
                "statistic": float(stat),
                "p_value": float(p_value),
                "is_normal": bool(p_value > alpha),
            }

        # Chi-square tests for each unordered pair of categorical columns
        for first, second in combinations(data_types["categorical_cols"], 2):
            if first == second:
                continue
            # Keep the result key independent of column order in the frame.
            col1, col2 = (first, second) if first < second else (second, first)

            contingency = pd.crosstab(df[col1], df[col2])
            if contingency.size == 0:
                # No row has both values present; nothing to test.
                continue
            try:
                chi2, p_value, _, _ = stats.chi2_contingency(contingency)
            except ValueError:
                # Degenerate table (e.g. zero expected frequencies).
                continue
            results[f"chi2_{col1}_{col2}"] = {
                "statistic": float(chi2),
                "p_value": float(p_value),
                "is_significant": bool(p_value < alpha),
            }

        return results

    def train_predictive_model(self, df: pd.DataFrame, target_col: str) -> Tuple[float, str]:
        """Train and evaluate a predictive model based on data characteristics

        A random forest classifier is used when the target is non-numeric or
        has at most five distinct values; otherwise a random forest regressor
        is used. Rows whose target is missing are dropped before training.

        Args:
            df: Source data including the target column.
            target_col: Name of the column to predict.

        Returns:
            ``(score, metric)`` where ``metric`` is ``"accuracy"`` or ``"r2"``
            and ``score`` is measured on a held-out 20% split.

        Raises:
            KeyError: If ``target_col`` is not a column of ``df``.
        """
        # A missing target cannot be learned from and makes the fit fail.
        labelled = df.dropna(subset=[target_col])
        X = labelled.drop(columns=[target_col])
        y = labelled[target_col]

        # Preprocessing
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object', 'category']).columns
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
            ]
        )

        # Text labels can never be regressed on, whatever their cardinality.
        is_classification = (
            not pd.api.types.is_numeric_dtype(y) or y.nunique() <= 5
        )
        if is_classification:
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            metric = 'accuracy'
        else:
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            metric = 'r2'

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model),
        ])

        # Train and evaluate
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        if metric == 'accuracy':
            score = accuracy_score(y_test, y_pred)
        else:
            score = r2_score(y_test, y_pred)

        return float(score), metric


class GradioInterface:
    """Gradio interface for the data analysis agent"""

    DEFAULT_SYSTEM_PROMPT = """
You are an expert data scientist and analyst who combines technical precision with clear communication.
You specialize in uncovering insights through advanced statistical analysis, machine learning, and data visualization.
Advanced statistical analysis and hypothesis testing
Machine learning model development and evaluation
Data visualization and exploratory data analysis
Pattern recognition and trend identification
Feature engineering and selection
Data Quality Assessment
Exploratory Data Analysis
Statistical Testing
Pattern Recognition
Insight Generation
Visualization Creation
Recommendations Development
Key Findings Summary
Detailed Statistical Analysis
Visualization Descriptions
Actionable Recommendations
Always explain statistical significance
Provide context for numerical findings
Highlight practical implications
Address data limitations
"""

    def __init__(self):
        """Start with no analyzer and no data; both are set per analysis run."""
        self.analyzer = None
        self.df = None

    def create_interface(self):
        """Build the Gradio Blocks app and wire up its event handlers.

        Returns:
            The ``gr.Blocks`` instance, ready for ``launch()``.
        """
        with gr.Blocks(theme=gr.themes.Soft()) as demo:
            gr.Markdown("# 🔍 Intelligent Data Analysis Agent")

            with gr.Row():
                with gr.Column(scale=1):
                    api_key = gr.Textbox(
                        label="GPT-4o-mini API Key",
                        type="password",
                        placeholder="sk-...",
                    )
                    file_input = gr.File(label="Upload CSV file")
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        system_prompt = gr.TextArea(
                            label="System Prompt",
                            # Class attribute: a bare name would not resolve here.
                            value=self.DEFAULT_SYSTEM_PROMPT,
                            lines=8,
                        )

            with gr.Row():
                analysis_notes = gr.Textbox(
                    label="Analysis Notes (Optional)",
                    placeholder="Any specific analysis preferences...",
                )

            with gr.Row():
                analyze_btn = gr.Button("Analyze Data")
                clear_btn = gr.Button("Clear")

            output_text = gr.Markdown()
            output_gallery = gr.Gallery()

            analyze_btn.click(
                self._analyze,
                inputs=[api_key, file_input, analysis_notes, system_prompt],
                outputs=[output_text, output_gallery],
            )
            clear_btn.click(
                lambda: (None, None),
                outputs=[output_text, output_gallery],
            )

        return demo

    def _analyze(self, api_key, file, notes, system_prompt):
        """Run the full analysis for one uploaded file.

        Args:
            api_key: Key entered in the UI.
            file: Upload from ``gr.File``; either a filesystem path or an
                object exposing the path as ``.name``.
            notes: Optional free-text preferences from the user.
            system_prompt: System prompt from the settings panel; the class
                default is used when it is empty.

        Returns:
            ``(markdown_summary, image_paths)``. On failure the first element
            is an error message and the second is ``None``.
        """
        if not api_key or not file:
            return "Please provide both API key and data file.", None

        try:
            file_path = file if isinstance(file, (str, os.PathLike)) else file.name
            self.df = pd.read_csv(file_path)
            self.analyzer = DataAnalyzer(api_key)

            # Get AI suggestions for analysis
            prompt = f"Data columns: {list(self.df.columns)}\nUser notes: {notes}\nSuggest appropriate analyses and visualizations."
            ai_suggestions = self.analyzer.call_gpt4o_mini(
                prompt, system_prompt or self.DEFAULT_SYSTEM_PROMPT
            )

            # Perform analysis
            data_types = self.analyzer.analyze_data_types(self.df)
            stats_results = self.analyzer.perform_statistical_tests(self.df, data_types)

            # Create visualizations
            numeric_cols = data_types["numeric_cols"]
            viz_paths = []
            if numeric_cols:
                viz_paths = [
                    self.analyzer.create_visualization(self.df, viz_type, numeric_cols)
                    for viz_type in ("correlation", "distribution", "boxplot")
                ]

            # Generate summary. It is built line by line with no leading
            # indentation, because Markdown renders indented text as a code
            # block and the statistics section spans several lines.
            summary = "\n".join([
                "## Data Analysis Results",
                "",
                "### AI Suggestions",
                str(ai_suggestions),
                "",
                "### Basic Statistics",
                f"- Rows: {len(self.df)}",
                f"- Columns: {len(self.df.columns)}",
                f"- Missing Values: {sum(data_types['missing_values'].values())}",
                "",
                "### Statistical Tests",
                self._format_stats_results(stats_results),
            ])
            return summary, viz_paths

        except Exception as e:
            # UI boundary: report any failure to the user instead of raising.
            return f"Error during analysis: {str(e)}", None

    @staticmethod
    def _format_stats_results(results: Dict) -> str:
        """Format statistical results for display

        Args:
            results: Mapping as returned by
                ``DataAnalyzer.perform_statistical_tests``.

        Returns:
            One Markdown bullet per recognised test, joined by newlines.
            Entries whose key has neither the ``normality_`` nor the ``chi2_``
            prefix are skipped.
        """
        formatted = []
        for test_name, result in results.items():
            # Match on the key prefix: a column name may itself contain
            # "normality" or "chi2".
            if test_name.startswith("normality_"):
                verdict = 'Normal' if result['is_normal'] else 'Non-normal'
            elif test_name.startswith("chi2_"):
                verdict = 'Significant' if result['is_significant'] else 'Not significant'
            else:
                continue
            formatted.append(f"- {test_name}: {verdict} (p={result['p_value']:.4f})")
        return "\n".join(formatted)


def main():
    """Build the interface and start the Gradio server."""
    interface = GradioInterface()
    demo = interface.create_interface()
    # NOTE(review): share=True publishes a public URL for an app that accepts
    # API keys; confirm this exposure is intended.
    demo.launch(share=True)


if __name__ == "__main__":
    main()