Spaces:

jzou19950715
/

Lossdog_Data_Science_Expert

Sleeping

App Files Files Community

jzou19950715 committed on Jan 21

Commit

4dec0f2

verified ·

1 Parent(s): dfc517e

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -132

app.py CHANGED Viewed

@@ -5,101 +5,78 @@ import google.generativeai as genai
 import gradio as gr
 from typing import Dict, List, Any, Tuple
 import json
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class DataAnalyzer:
     def __init__(self):
         self.model = None
         self.api_key = None
         self.system_prompt = None
         self.df = None
     def configure_api(self, api_key: str):
-                try:
-            response = self.model.generate_content(prompt)
-            return response.text
-        except Exception as e:
-            logger.error(f"Analysis failed: {str(e)}")
-            return f"Analysis failed: {str(e)}"
-def create_interface():
-    analyzer = DataAnalyzer()
-    def process_inputs(api_key: str, system_prompt: str, file, query: str):
-        """Process user inputs and return analysis results"""
-        # Configure API
-        if api_key != analyzer.api_key:
-            if not analyzer.configure_api(api_key):
-                return "Failed to configure API. Please check your API key."
-        # Update system prompt
-        analyzer.system_prompt = system_prompt
-        # Load data if new file provided
-        if file is not None:
-            success, message = analyzer.load_data(file)
-            if not success:
-                return message
-        # Run analysis
-        return analyzer.analyze(query)
-    # Create Gradio interface
-    with gr.Blocks(title="Data Analysis Assistant") as interface:
-        gr.Markdown("# Data Analysis Assistant")
-        gr.Markdown("Upload your CSV file and get AI-powered analysis")
-        with gr.Row():
-            api_key_input = gr.Textbox(
-                label="Gemini API Key",
-                placeholder="Enter your Gemini API key",
-                type="password"
-            )
-        with gr.Row():
-            system_prompt_input = gr.Textbox(
-                label="System Prompt",
-                placeholder="Enter system prompt for the AI",
-                value="You are a data analysis expert. Analyze the provided data and answer the user's query.",
-                lines=3
-            )
-        with gr.Row():
-            file_input = gr.File(
-                label="Upload CSV",
-                file_types=[".csv"]
-            )
-        with gr.Row():
-            query_input = gr.Textbox(
-                label="Analysis Query",
-                placeholder="What would you like to know about the data?",
-                lines=2
-            )
-        with gr.Row():
-            submit_btn = gr.Button("Analyze")
-        with gr.Row():
-            output = gr.Markdown(label="Analysis Results")
-        submit_btn.click(
-            fn=process_inputs,
-            inputs=[api_key_input, system_prompt_input, file_input, query_input],
-            outputs=output
-        )
-    return interface
-def main():
-    interface = create_interface()
-    interface.launch()
-if __name__ == "__main__":
-    main()Configure the Gemini API with the provided key"""
         try:
             genai.configure(api_key=api_key)
             self.model = genai.GenerativeModel('gemini-1.5-pro')
@@ -113,6 +90,7 @@ if __name__ == "__main__":
         """Load data from uploaded CSV file"""
         try:
             self.df = pd.read_csv(file.name)
             return True, f"Loaded CSV with {len(self.df)} rows and {len(self.df.columns)} columns"
         except Exception as e:
             logger.error(f"Data loading failed: {str(e)}")
@@ -131,16 +109,16 @@ if __name__ == "__main__":
         }
         return info
-    def analyze(self, query: str) -> str:
-        """Analyze data based on user query"""
         if self.model is None:
-            return "Please configure API key first"
         if self.df is None:
-            return "Please upload a CSV file first"
         data_info = self.get_data_info()
-        # Combine system prompt with data context
         prompt = f"""{self.system_prompt}
 Data Information:
@@ -148,52 +126,89 @@ Data Information:
 - Number of rows: {data_info['rows']}
 - Sample data: {json.dumps(data_info['sample'], indent=2)}
-User Query: {query}
-Please analyze this data and provide:
-1. A clear explanation of your findings
-2. Key statistics relevant to the query
-3. If appropriate, suggest visualizations that would help understand the data better
-Response Format:
-1. First give a direct answer to the query
-2. Then provide supporting statistics
-3. Finally, suggest any relevant additional insights
-Remember to handle:
-- Missing or null values
-- Outliers
-- Data type conversions if needed
-- Basic error checking
 """
         try:
-            # Call Gemini API
             response = self.model.generate_content(prompt)
-            # Extract and format the response
-            if response.text:
-                formatted_response = (
-                    "## Analysis Results\n\n"
-                    f"{response.text}\n\n"
-                    "---\n"
-                    "Note: This analysis was generated using the provided data. "
-                    "Please verify any critical insights independently."
-                )
-                return formatted_response
-            else:
-                return "No analysis could be generated. Please try a different query."
         except Exception as e:
             logger.error(f"Analysis failed: {str(e)}")
-            error_message = (
-                "## Error During Analysis\n\n"
-                f"The analysis failed with error: {str(e)}\n\n"
-                "Please try:\n"
-                "1. Checking your API key\n"
-                "2. Simplifying your query\n"
-                "3. Ensuring your data is properly formatted"
-            )
-            return error_message
 def create_interface():
     """Create the Gradio interface"""
@@ -201,27 +216,23 @@ def create_interface():
     def process_inputs(api_key: str, system_prompt: str, file, query: str):
         """Process user inputs and return analysis results"""
-        # Configure API
         if api_key != analyzer.api_key:
             if not analyzer.configure_api(api_key):
                 return "Failed to configure API. Please check your API key."
-        # Update system prompt
         analyzer.system_prompt = system_prompt
-        # Load data if new file provided
         if file is not None:
             success, message = analyzer.load_data(file)
             if not success:
                 return message
-        # Run analysis
         return analyzer.analyze(query)
     # Create Gradio interface
-    with gr.Blocks(title="Data Analysis Assistant") as interface:
-        gr.Markdown("# Data Analysis Assistant")
-        gr.Markdown("Upload your CSV file and get AI-powered analysis")
         with gr.Row():
             api_key_input = gr.Textbox(
@@ -234,8 +245,13 @@ def create_interface():
             system_prompt_input = gr.Textbox(
                 label="System Prompt",
                 placeholder="Enter system prompt for the AI",
-                value="You are a data analysis expert. Analyze the provided data and answer the user's query.",
-                lines=3
             )
         with gr.Row():

 import gradio as gr
 from typing import Dict, List, Any, Tuple
 import json
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import base64
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
class DataTools:
    """Tools for data analysis that can be called by the AI.

    Wraps a single DataFrame. Every method reports failure through its return
    value (an ``"error"`` entry or an ``"Error ..."`` string) rather than by
    raising, so a caller can pass the result straight back to the user.
    """

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to the DataFrame the tools operate on."""
        self.df = df

    def describe_column(self, column: str) -> dict:
        """Get statistical description of a column.

        Returns:
            ``{"statistics", "null_count", "dtype"}`` for an existing column,
            or ``{"error": ...}`` when the column is not in the DataFrame.
        """
        if column not in self.df.columns:
            return {"error": f"Column {column} not found"}

        series = self.df[column]
        return {
            "statistics": series.describe().to_dict(),
            # Cast so the count is a plain int rather than a numpy integer.
            "null_count": int(series.isnull().sum()),
            "dtype": str(series.dtype),
        }

    def create_visualization(self, plot_type: str, x: str, y: str = None, title: str = None) -> str:
        """Create a visualization and return it as a base64-encoded PNG.

        Args:
            plot_type: One of "histogram", "scatter", "boxplot", "bar".
            x: Column plotted on the x axis.
            y: Column plotted on the y axis (not used by "histogram").
            title: Optional plot title.

        Returns:
            The base64 text of the PNG, or a string starting with
            "Error creating visualization:" when the plot type is unknown or
            plotting fails.
        """
        # Dispatch table; the lambdas defer every seaborn call until the plot
        # type has been validated and a figure exists.
        plotters = {
            "histogram": lambda: sns.histplot(data=self.df, x=x),
            "scatter": lambda: sns.scatterplot(data=self.df, x=x, y=y),
            "boxplot": lambda: sns.boxplot(data=self.df, x=x, y=y),
            "bar": lambda: sns.barplot(data=self.df, x=x, y=y),
        }
        draw = plotters.get(plot_type)
        if draw is None:
            # Previously an unknown type fell through and produced an empty
            # image; report it instead.
            supported = ", ".join(plotters)
            return (
                "Error creating visualization: unsupported plot type "
                f"{plot_type!r} (expected one of: {supported})"
            )

        fig = None
        try:
            fig = plt.figure(figsize=(10, 6))
            draw()
            if title:
                plt.title(title)
            # Render into memory and encode the bytes.
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
            return base64.b64encode(buf.getvalue()).decode('utf-8')
        except Exception as e:
            return f"Error creating visualization: {str(e)}"
        finally:
            # Close on every path so a failed plot does not leave the figure
            # registered with pyplot.
            if fig is not None:
                plt.close(fig)

    def get_correlation(self, columns: List[str]) -> dict:
        """Get correlation between specified columns.

        Returns:
            ``{"correlation_matrix": ...}`` as a nested dict, or
            ``{"error": ...}`` when the selection or the computation fails.
        """
        try:
            matrix = self.df[columns].corr()
        except Exception as e:
            return {"error": f"Error calculating correlation: {str(e)}"}
        return {"correlation_matrix": matrix.to_dict()}
 class DataAnalyzer:
     def __init__(self):
         self.model = None
         self.api_key = None
         self.system_prompt = None
         self.df = None
+        self.tools = None
     def configure_api(self, api_key: str):
+        """Configure the Gemini API with the provided key"""
         try:
             genai.configure(api_key=api_key)
             self.model = genai.GenerativeModel('gemini-1.5-pro')
         """Load data from uploaded CSV file"""
         try:
             self.df = pd.read_csv(file.name)
+            self.tools = DataTools(self.df)
             return True, f"Loaded CSV with {len(self.df)} rows and {len(self.df.columns)} columns"
         except Exception as e:
             logger.error(f"Data loading failed: {str(e)}")
         }
         return info
+    def analyze(self, query: str) -> Dict[str, Any]:
+        """Analyze data based on user query with structured output"""
         # Flow: build a prompt from the system prompt, the data summary and the
         # tool list; send it to self.model; parse the reply as JSON; run each
         # entry of "tools_used" against self.tools; render everything as Markdown.
         # NOTE(review): the annotation says Dict[str, Any], yet only the two
         # guard clauses below return a dict -- every other path returns a str.
         # Confirm which type callers expect.
         if self.model is None:
+            return {"error": "Please configure API key first"}
         if self.df is None:
+            return {"error": "Please upload a CSV file first"}
         data_info = self.get_data_info()
+        # Combine system prompt with data context and tool instructions
         # NOTE(review): the JSON template further down is written with single
         # braces inside this f-string. f-strings treat "{" as the start of a
         # replacement field, so literal braces normally need doubling -- verify
         # that this prompt compiles and formats as intended.
         prompt = f"""{self.system_prompt}
 Data Information:
 - Number of rows: {data_info['rows']}
 - Sample data: {json.dumps(data_info['sample'], indent=2)}
+Available Tools:
+1. describe_column(column: str) - Get statistical description of a column
+2. create_visualization(plot_type: str, x: str, y: str = None, title: str = None)
+   - Create visualizations (types: histogram, scatter, boxplot, bar)
+3. get_correlation(columns: List[str]) - Get correlation between columns
+User Query: {query}
+Please provide a structured analysis in the following JSON format:
+{
+    "answer": "Direct answer to the query",
+    "tools_used": [
+        {
+            "tool": "tool_name",
+            "parameters": {"param1": "value1"},
+            "purpose": "Why this tool was used"
+        }
+    ],
+    "insights": ["List of key insights"],
+    "visualizations": ["List of suggested visualizations"],
+    "recommendations": ["List of recommendations"],
+    "limitations": ["Any limitations in the analysis"]
+}
+Important:
+- Be specific about which tools to use
+- Provide clear reasoning for each tool choice
+- Structure the output exactly as shown above
 """
         try:
+            # Get initial response from Gemini
             response = self.model.generate_content(prompt)
+            response_text = response.text
+            try:
+                # Parse the response as JSON
                 # json.loads needs the reply to be bare JSON; anything else
                 # ends up in the JSONDecodeError branch at the bottom.
+                structured_response = json.loads(response_text)
+                # Execute tool calls based on response
+                results = {"response": structured_response, "tool_outputs": []}
+                for tool_call in structured_response.get("tools_used", []):
+                    tool_name = tool_call["tool"]
+                    parameters = tool_call["parameters"]
                     # Dispatch is by attribute name: whatever name the model
                     # returned is looked up on self.tools and called with the
                     # model-supplied keyword arguments. Names that are not
                     # attributes of self.tools are skipped without a message.
                     # NOTE(review): hasattr matches any attribute, not only the
                     # three advertised tools -- consider an explicit allow-list.
+                    if hasattr(self.tools, tool_name):
+                        tool_method = getattr(self.tools, tool_name)
+                        tool_result = tool_method(**parameters)
+                        results["tool_outputs"].append({
+                            "tool": tool_name,
+                            "parameters": parameters,
+                            "result": tool_result
+                        })
+                # Format output for Gradio
                 # The ['answer'] / ['insights'] / ... lookups raise KeyError when
                 # the model omits a section; that is caught by the outer
                 # `except Exception` and reported as "Error during analysis".
                 # NOTE(review): the replacement fields below contain backslashes,
                 # which f-strings reject before Python 3.12 (PEP 701) -- confirm
                 # the target interpreter.
+                formatted_output = f"""## Analysis Results
+{structured_response['answer']}
+### Key Insights
+{"".join(['- ' + insight + '\\n' for insight in structured_response['insights']])}
+### Visualizations
+{"".join(['- ' + viz + '\\n' for viz in structured_response['visualizations']])}
+### Recommendations
+{"".join(['- ' + rec + '\\n' for rec in structured_response['recommendations']])}
+### Limitations
+{"".join(['- ' + lim + '\\n' for lim in structured_response['limitations']])}
+---
+Tool Outputs:
+{"".join([f'\\n**{out["tool"]}**:\\n```json\\n{json.dumps(out["result"], indent=2)}\\n```' for out in results['tool_outputs']])}
+"""
+                return formatted_output
+            except json.JSONDecodeError:
+                return f"Error: Could not parse structured response\\n\\nRaw response:\\n{response_text}"
         except Exception as e:
             logger.error(f"Analysis failed: {str(e)}")
+            return f"Error during analysis: {str(e)}"
 def create_interface():
     """Create the Gradio interface"""
     def process_inputs(api_key: str, system_prompt: str, file, query: str):
         """Process user inputs and return analysis results as text.

         Reconfigures the API only when the key differs from the one the
         analyzer already holds, stores the system prompt, loads the file when
         one is supplied, then delegates to ``analyzer.analyze``.

         Returns:
             A string: the message of the first step that failed, or the
             analysis output.
         """
         if api_key != analyzer.api_key and not analyzer.configure_api(api_key):
             return "Failed to configure API. Please check your API key."

         analyzer.system_prompt = system_prompt

         if file is not None:
             success, message = analyzer.load_data(file)
             if not success:
                 return message

         result = analyzer.analyze(query)
         # analyze() signals a missing model or missing data with an
         # {"error": ...} dict; unwrap it so this callback always yields text.
         if isinstance(result, dict):
             return str(result.get("error", result))
         return result
     # Create Gradio interface
+    with gr.Blocks(title="Advanced Data Analysis Assistant") as interface:
+        gr.Markdown("# Advanced Data Analysis Assistant")
+        gr.Markdown("Upload your CSV file and get AI-powered analysis with visualizations")
         with gr.Row():
             api_key_input = gr.Textbox(
             system_prompt_input = gr.Textbox(
                 label="System Prompt",
                 placeholder="Enter system prompt for the AI",
+                value="""You are an advanced data analysis expert. Analyze the provided data and answer the query.
+Focus on:
+1. Clear, structured analysis
+2. Statistical insights
+3. Appropriate visualizations
+4. Actionable recommendations""",
+                lines=4
             )
         with gr.Row():