Spaces:

jzou19950715
/

Lossdog_Data_Science_Expert

Running

App Files Files Community

jzou19950715 committed on Jan 20

Commit

e7486fb

verified ·

1 Parent(s): e826aa2

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -267

app.py CHANGED Viewed

@@ -1,289 +1,210 @@
 import os
-import shutil
-import gradio as gr
 import pandas as pd
-import numpy as np
 import plotly.express as px
-from typing import Dict, List, Optional, Tuple
-from dataclasses import dataclass
-from openai import OpenAI  # OpenAI API client
-# Configuration class for agent settings
-@dataclass
-class AgentConfig:
-    """Configuration for the data science agent"""
-    system_prompt: str = """
-    You are an expert data scientist. Analyze the data and provide insights.
-    Your responses should be clear, concise, and actionable. Always provide explanations
-    for your analysis and include visualizations when appropriate.
-    """
-    max_iterations: int = 10
-    temperature: float = 0.7
-    model_name: str = "gpt-4"  # Use GPT-4 or another valid model
-# Data Analysis State class
-@dataclass
-class AnalysisState:
-    """Maintains state for ongoing analysis"""
-    df: Optional[pd.DataFrame] = None
-    current_analysis: Dict = None
-    visualizations: List[Dict] = None
-    error_log: List[str] = None
-    def clear(self):
-        self.df = None
-        self.current_analysis = None
-        self.visualizations = None
-        self.error_log = []
-    def log_error(self, error: str):
-        if self.error_log is None:
-            self.error_log = []
-        self.error_log.append(error)
-# Helper functions for data processing
-def process_uploaded_file(file) -> Tuple[Optional[pd.DataFrame], Dict]:
-    """Process uploaded file and return DataFrame with info"""
     try:
-        if file.name.endswith('.csv'):
-            df = pd.read_csv(file.name)
-        elif file.name.endswith('.xlsx'):
-            df = pd.read_excel(file.name)
-        elif file.name.endswith('.json'):
-            df = pd.read_json(file.name)
-        else:
-            return None, {"error": "Unsupported file format"}
-        info = {
-            "shape": df.shape,
-            "columns": list(df.columns),
-            "dtypes": df.dtypes.to_dict(),
-            "missing_values": df.isnull().sum().to_dict(),
-            "numeric_columns": list(df.select_dtypes(include=[np.number]).columns),
-            "categorical_columns": list(df.select_dtypes(exclude=[np.number]).columns)
-        }
-        return df, info
     except Exception as e:
-        return None, {"error": str(e)}
-def create_visualization(data: pd.DataFrame, viz_type: str, params: Dict) -> Optional[str]:
-    """Create visualization based on type and parameters"""
     try:
-        if viz_type == "scatter":
-            fig = px.scatter(
-                data,
-                x=params["x"],
-                y=params["y"],
-                color=params.get("color"),
-                title=params.get("title", "Scatter Plot")
-            )
-        elif viz_type == "histogram":
-            fig = px.histogram(
-                data,
-                x=params["x"],
-                nbins=params.get("nbins", 30),
-                title=params.get("title", "Distribution")
-            )
-        elif viz_type == "heatmap":
-            numeric_cols = data.select_dtypes(include=[np.number]).columns
-            corr = data[numeric_cols].corr()
-            fig = px.imshow(
-                corr,
-                labels=dict(color="Correlation"),
-                title=params.get("title", "Correlation Heatmap")
-            )
-        else:
-            return None
-        # Convert Plotly figure to HTML
-        return fig.to_html(full_html=False)
     except Exception as e:
-        return {"error": str(e)}
-def load_example_data(dataset_name: str = "iris") -> Tuple[pd.DataFrame, Dict]:
-    """Load example dataset (Iris or Diabetes)"""
-    try:
-        if dataset_name == "iris":
-            data = load_iris()
-            df = pd.DataFrame(data.data, columns=data.feature_names)
-            df['target'] = data.target
-        elif dataset_name == "diabetes":
-            data = load_diabetes()
-            df = pd.DataFrame(data.data, columns=data.feature_names)
-            df['target'] = data.target
-        else:
-            return None, {"error": "Invalid dataset name"}
-        info = {
-            "shape": df.shape,
-            "columns": list(df.columns),
-            "dtypes": df.dtypes.to_dict(),
-            "missing_values": df.isnull().sum().to_dict(),
-            "numeric_columns": list(df.select_dtypes(include=[np.number]).columns),
-            "categorical_columns": list(df.select_dtypes(exclude=[np.number]).columns)
-        }
-        return df, info
-    except Exception as e:
-        return None, {"error": str(e)}
-def query_openai(api_key: str, system_prompt: str, user_prompt: str) -> str:
-    """Query OpenAI API with the given prompts"""
-    try:
-        client = OpenAI(api_key=api_key)
-        response = client.chat.completions.create(
-            model="gpt-4",  # Use GPT-4 or another valid model
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_prompt}
-            ],
-            temperature=0.7,
-            max_tokens=500
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        return f"Error querying OpenAI API: {str(e)}"
-def create_demo():
-    # Initialize configuration and state
-    config = AgentConfig()
-    analysis_state = AnalysisState()
-    with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🔬 Advanced Data Science Agent")
-        with gr.Row():
-            with gr.Column(scale=1):
-                api_key = gr.Textbox(
-                    label="OpenAI API Key",
-                    type="password",
-                    placeholder="Enter your OpenAI API key"
-                )
-                file_input = gr.File(
-                    label="Upload Data",
-                    file_types=[".csv", ".xlsx", ".json"]
-                )
-                example_btn = gr.Button("Load Example Dataset")
-                with gr.Accordion("Visualization Settings", open=False):
-                    viz_type = gr.Dropdown(
-                        choices=["scatter", "histogram", "heatmap"],
-                        label="Visualization Type",
-                        value="scatter"
-                    )
-                    x_axis = gr.Dropdown(label="X-axis", interactive=True)
-                    y_axis = gr.Dropdown(label="Y-axis", interactive=True)
-                    color_column = gr.Dropdown(label="Color Column", interactive=True)
-                with gr.Accordion("System Prompt", open=False):
-                    system_prompt = gr.Textbox(
-                        label="System Prompt",
-                        value=config.system_prompt,
-                        lines=5
-                    )
-            with gr.Column(scale=2):
-                chatbot = gr.Chatbot(label="Analysis Chat", height=300)
-                with gr.Row():
-                    chat_input = gr.Textbox(
-                        label="Ask about your data",
-                        placeholder="Type your question here...",
-                        lines=2
-                    )
-                    submit_btn = gr.Button("Send", variant="primary")
-                plot_output = gr.HTML(label="Generated Plots")
-                results_json = gr.JSON(label="Analysis Results")
-                error_output = gr.Textbox(label="Error Log", visible=False)
-        # Event handlers
-        def handle_file_upload(file):
-            if file is None:
-                return None, None, None, "No file uploaded"
-            df, info = process_uploaded_file(file)
-            if df is not None:
-                analysis_state.df = df
-                analysis_state.current_analysis = info
-                return info, list(df.columns), list(df.columns), None
-            return None, None, None, "Failed to load file"
-        def handle_example_data():
-            df, info = load_example_data("iris")
-            if df is not None:
-                analysis_state.df = df
-                analysis_state.current_analysis = info
-                return info, list(df.columns), list(df.columns), None
-            return None, None, None, "Failed to load example data"
-        def handle_visualization(viz_type, x_axis, y_axis, color_column):
-            if analysis_state.df is None:
-                return None, "No data available"
-            params = {"x": x_axis, "y": y_axis, "color": color_column}
-            fig_html = create_visualization(analysis_state.df, viz_type, params)
-            if fig_html is not None:
-                return fig_html, None
-            return None, "Failed to create visualization"
-        def handle_chat_message(api_key, system_prompt, message, chat_history):
-            if analysis_state.df is None:
-                return chat_history + [(message, "Please upload a data file first.")], ""
-            if not api_key:
-                return chat_history + [(message, "Please enter your OpenAI API key.")], ""
-            # Query OpenAI API
-            response = query_openai(api_key, system_prompt, message)
-            return chat_history + [(message, response)], ""
-        # Connect event handlers
-        file_input.change(
-            handle_file_upload,
-            inputs=[file_input],
-            outputs=[results_json, x_axis, y_axis, error_output]
-        )
-        example_btn.click(
-            handle_example_data,
-            outputs=[results_json, x_axis, y_axis, error_output]
-        )
-        viz_type.change(
-            handle_visualization,
-            inputs=[viz_type, x_axis, y_axis, color_column],
-            outputs=[plot_output, error_output]
-        )
-        x_axis.change(
-            handle_visualization,
-            inputs=[viz_type, x_axis, y_axis, color_column],
-            outputs=[plot_output, error_output]
-        )
-        y_axis.change(
-            handle_visualization,
-            inputs=[viz_type, x_axis, y_axis, color_column],
-            outputs=[plot_output, error_output]
-        )
-        color_column.change(
-            handle_visualization,
-            inputs=[viz_type, x_axis, y_axis, color_column],
-            outputs=[plot_output, error_output]
-        )
-        submit_btn.click(
-            handle_chat_message,
-            inputs=[api_key, system_prompt, chat_input, chatbot],
-            outputs=[chatbot, chat_input]
-        )
-        return demo
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch(share=True)
-else:
-    demo = create_demo()
-    demo.launch(show_api=False)

+# app.py
+import streamlit as st
+import google.generativeai as generativeai
 import os
+import re
+import json
+import logging
 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+from io import StringIO
+def load_data(uploaded_file):
+    """Read an uploaded CSV into a DataFrame.
+
+    Args:
+        uploaded_file: Anything ``pd.read_csv`` accepts (path or file-like).
+
+    Returns:
+        The parsed DataFrame, or ``None`` when reading fails; the failure is
+        reported to the user through ``st.error``.
+    """
+    try:
+        return pd.read_csv(uploaded_file)
+    except Exception as exc:
+        # UI boundary: surface the problem in the page instead of crashing.
+        st.error(f"Error: {str(exc)}")
+    return None
+def get_numeric_columns(df):
+    """Return the labels of every numeric column in ``df``.
+
+    All integer and float widths are selected (not only ``int64`` and
+    ``float64``), so down-cast or unsigned columns are still offered for
+    plotting and summaries.
+
+    Args:
+        df: The DataFrame to inspect.
+
+    Returns:
+        A pandas ``Index`` holding the numeric column labels.
+    """
+    # timedelta64 is a NumPy integer subtype, so the generic "number" selector
+    # would match it; exclude it to keep only plain numeric columns.
+    return df.select_dtypes(include="number", exclude=["timedelta"]).columns
+def get_categorical_columns(df):
+    """Return the labels of ``df`` columns stored with ``object`` or ``category`` dtype."""
+    categorical_view = df.select_dtypes(include=['object', 'category'])
+    return categorical_view.columns
+# Configure logging: records at INFO and above are sent to a stream handler,
+# each prefixed with a timestamp and its level name.
+logging.basicConfig(
+    handlers=[logging.StreamHandler()],
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    level=logging.INFO,
+)
+# Module-level logger shared by the functions below.
+logger = logging.getLogger(__name__)
+def configure_gemini():
+    """Configure Google's Gemini AI model.
+
+    Loads environment variables via ``dotenv``, reads ``GOOGLE_API_KEY`` and
+    builds a ``gemini-1.0-pro`` model handle.
+
+    Returns:
+        The configured ``GenerativeModel``, or ``None`` when the key is
+        missing or configuration fails (either case is reported through
+        ``st.error``).
+    """
     try:
+        # Imported inside the try so that a failing import is reported
+        # through st.error like any other configuration problem.
+        from dotenv import load_dotenv
+        load_dotenv()
+        key = os.getenv("GOOGLE_API_KEY")
+        if key:
+            generativeai.configure(api_key=key)
+            return generativeai.GenerativeModel('gemini-1.0-pro')
+        st.error("Please set your GOOGLE_API_KEY in the .env file")
+        return None
     except Exception as e:
+        st.error(f"Error configuring Gemini: {str(e)}")
+        return None
+def _extract_json_payload(text):
+    """Parse the first JSON object found in ``text``.
+
+    A model reply is free-form text, so the JSON may be wrapped in a Markdown
+    code fence or surrounded by commentary. The outermost ``{...}`` span is
+    isolated before decoding.
+
+    Raises:
+        ValueError: If no JSON object is present or it cannot be decoded
+            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
+    """
+    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
+    candidate = fenced.group(1) if fenced else text
+    start = candidate.find("{")
+    end = candidate.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError("No JSON object found in model response")
+    return json.loads(candidate[start:end + 1])
+def get_ai_visualization_suggestion(df, user_query):
+    """Get AI-powered visualization suggestions based on the data and user query.
+
+    Args:
+        df: DataFrame whose column names, dtypes and first rows are described
+            to the model.
+        user_query: Free-text description of what the user wants to see.
+
+    Returns:
+        The decoded JSON suggestion, or ``None`` when the model is not
+        configured, the request fails, or the reply holds no valid JSON
+        (failures are logged).
+    """
+    model = configure_gemini()
+    if not model:
+        return None
+    # Create a prompt for the AI
+    columns_info = {
+        'column_names': list(df.columns),
+        'data_types': {col: str(df[col].dtype) for col in df.columns},
+        'sample_values': {col: df[col].head().tolist() for col in df.columns}
+    }
+    # default=str is deliberate: the dump only feeds a text prompt, so sample
+    # values that are not JSON-native are shown as text rather than raising
+    # TypeError before the request is even sent.
+    dataset_info = json.dumps(columns_info, indent=2, default=str)
+    prompt = f"""
+    Analyze this dataset and the user's query to suggest the best visualization approach:
+    User Query: {user_query}
+    Dataset Information:
+    {dataset_info}
+    Please suggest:
+    1. The most appropriate type of visualization
+    2. Which columns should be used
+    3. Any data transformations needed
+    4. Visualization parameters (like color schemes, labels, etc.)
+    Format your response as JSON with the following structure:
+    {{
+        "viz_type": "type of visualization",
+        "columns": ["column1", "column2"],
+        "transformations": ["transformation1", "transformation2"],
+        "parameters": {{
+            "param1": "value1",
+            "param2": "value2"
+        }}
+    }}
+    """
     try:
+        response = model.generate_content(prompt)
+        # Extract JSON from response (tolerates code fences / extra prose)
+        return _extract_json_payload(response.text)
     except Exception as e:
+        logger.error(f"Error getting AI suggestion: {str(e)}")
+        return None
+def main():
+    """Render the Streamlit dashboard.
+
+    Lets the user upload a CSV, shows a data overview (shape, preview and
+    missing values), offers an AI visualization suggestion for a free-text
+    query, draws the manually selected chart and prints summary statistics.
+    """
+    st.title("📊 AI-Powered Data Visualization Dashboard")
+    st.write("Upload your CSV file and explore the data through various visualizations!")
+    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+    if uploaded_file is not None:
+        df = load_data(uploaded_file)
+        if df is not None:
+            st.success("File successfully loaded!")
+            # Basic Data Info
+            st.header("📝 Data Overview")
+            st.write(f"Number of rows: {df.shape[0]}")
+            st.write(f"Number of columns: {df.shape[1]}")
+            # Data Preview
+            st.subheader("Data Preview")
+            st.dataframe(df.head())
+            # Missing Values Analysis
+            st.subheader("Missing Values Analysis")
+            missing_data = df.isnull().sum()
+            if missing_data.sum() > 0:
+                st.write("Missing values by column:")
+                st.write(missing_data[missing_data > 0])
+            else:
+                st.write("No missing values found in the dataset!")
+            # User Query for AI Suggestions
+            st.header("🤖 AI-Powered Visualization")
+            user_query = st.text_input("Describe what you want to visualize",
+                                     "Show me trends in the data")
+            if st.button("Get AI Suggestion"):
+                # NOTE(review): the handler body was cut off in the reviewed
+                # source (unterminated spinner string). This is a minimal
+                # reconstruction that shows the raw suggestion - confirm
+                # against the intended behaviour.
+                with st.spinner("Getting AI visualization suggestion..."):
+                    suggestion = get_ai_visualization_suggestion(df, user_query)
+                if suggestion:
+                    st.json(suggestion)
+                else:
+                    st.warning("Could not get an AI suggestion. Please try again.")
+            viz_type = st.selectbox(
+                "Choose visualization type",
+                ["Scatter Plot", "Line Plot", "Bar Plot", "Histogram", "Box Plot", "Correlation Heatmap"]
+            )
+            numeric_columns = get_numeric_columns(df)
+            categorical_columns = get_categorical_columns(df)
+            if viz_type == "Scatter Plot" and len(numeric_columns) >= 2:
+                x_col = st.selectbox("Select X axis", numeric_columns)
+                y_col = st.selectbox("Select Y axis", numeric_columns)
+                color_col = st.selectbox("Select Color variable (optional)",
+                                       ["None"] + list(df.columns))
+                if color_col == "None":
+                    fig = px.scatter(df, x=x_col, y=y_col)
+                else:
+                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col)
+                st.plotly_chart(fig)
+            elif viz_type == "Line Plot" and len(numeric_columns) >= 1:
+                x_col = st.selectbox("Select X axis", df.columns)
+                y_col = st.selectbox("Select Y axis", numeric_columns)
+                fig = px.line(df, x=x_col, y=y_col)
+                st.plotly_chart(fig)
+            # Same numeric-column guard as the other chart types: the Y axis
+            # selector has nothing to offer without one.
+            elif viz_type == "Bar Plot" and len(numeric_columns) >= 1:
+                x_col = st.selectbox("Select X axis", df.columns)
+                y_col = st.selectbox("Select Y axis", numeric_columns)
+                fig = px.bar(df, x=x_col, y=y_col)
+                st.plotly_chart(fig)
+            elif viz_type == "Histogram" and len(numeric_columns) >= 1:
+                col = st.selectbox("Select column", numeric_columns)
+                bins = st.slider("Number of bins", min_value=5, max_value=100, value=30)
+                fig = px.histogram(df, x=col, nbins=bins)
+                st.plotly_chart(fig)
+            elif viz_type == "Box Plot" and len(numeric_columns) >= 1:
+                y_col = st.selectbox("Select column for box plot", numeric_columns)
+                x_col = st.selectbox("Select grouping variable (optional)",
+                                   ["None"] + list(categorical_columns))
+                if x_col == "None":
+                    fig = px.box(df, y=y_col)
+                else:
+                    fig = px.box(df, x=x_col, y=y_col)
+                st.plotly_chart(fig)
+            elif viz_type == "Correlation Heatmap" and len(numeric_columns) >= 2:
+                corr_matrix = df[numeric_columns].corr()
+                fig = px.imshow(corr_matrix,
+                              labels=dict(color="Correlation"),
+                              x=corr_matrix.columns,
+                              y=corr_matrix.columns)
+                st.plotly_chart(fig)
+            else:
+                # Reached when the dataset lacks the numeric columns the
+                # chosen chart requires; say so instead of rendering nothing.
+                st.info("Not enough numeric columns in this dataset for the selected visualization.")
+            # Data Summary
+            st.header("📊 Data Summary")
+            if len(numeric_columns) > 0:
+                st.subheader("Numerical Columns Summary")
+                st.write(df[numeric_columns].describe())
+            if len(categorical_columns) > 0:
+                st.subheader("Categorical Columns Summary")
+                for col in categorical_columns:
+                    st.write(f"\nValue counts for {col}:")
+                    st.write(df[col].value_counts())
 if __name__ == "__main__":
+    main()