Spaces:

jzou19950715
/

Lossdog_Data_Science_Expert

Running

App Files Files Community

jzou19950715 committed on Jan 17

Commit

bcd9ccf

verified ·

1 Parent(s): 6a8cba0

Create app.py

Browse files

Files changed (1) hide show

app.py +329 -0

app.py ADDED Viewed

	@@ -0,0 +1,329 @@

+import os
+import sys
+import subprocess
+import requests
+import gradio as gr
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import LabelEncoder
+# --------------------------------------------------------------------------------
+# OPTIONAL: dynamic installation for rarely used packages not in requirements.txt
+# --------------------------------------------------------------------------------
+def install_library(library):
+    """
+    Run ``pip install <library>`` using the current Python interpreter.
+    Meant for rarely used packages that are NOT listed in requirements.txt.
+    Always returns a status string; failures are reported in the returned
+    message instead of being raised.
+    """
+    pip_command = [sys.executable, "-m", "pip", "install", library]
+    try:
+        subprocess.check_call(pip_command)
+    except Exception as err:
+        outcome = f"Error installing {library}: {str(err)}"
+    else:
+        outcome = f"Successfully installed {library}."
+    return outcome
+def dynamic_import(library, alias=None):
+    """
+    Import ``library`` at runtime and bind it in this module's globals.
+    The module is bound under ``alias`` when one is given, otherwise under
+    ``library``. If the first import raises ImportError, the package is
+    installed via install_library() and the import is retried once; a second
+    ImportError propagates to the caller.
+    """
+    import importlib  # local import: only needed on the install-and-retry path
+    bind_name = alias if alias else library
+    try:
+        module = __import__(library)
+    except ImportError:
+        # install_library() reports failures as text, so print the status and
+        # let the retry below raise if the package is still unavailable.
+        print(install_library(library))
+        # Packages installed while the interpreter is running may be hidden by
+        # stale finder caches until they are invalidated.
+        importlib.invalidate_caches()
+        module = __import__(library)
+    # Bind once, after either path, so the alias is honoured after an install too.
+    globals()[bind_name] = module
+# --------------------------------------------------------------------------------
+# LLM CALLS: GPT-4o-mini, OpenAI, DeepSeek, Gemini
+# --------------------------------------------------------------------------------
+import openai
+from huggingface_hub import InferenceClient
+def call_gpt4o_mini(api_key, user_prompt):
+    """
+    Send ``user_prompt`` to a GPT-4o-mini model hosted on Hugging Face.
+    Replace 'someUser/gpt-4o-mini' with your actual model repo.
+    Returns the generated text, or an explanatory message when no API key is
+    supplied or the call fails (errors are reported as text, never raised).
+    """
+    if not api_key:
+        return "No Hugging Face API key provided. Cannot call GPT-4o-mini."
+    try:
+        # InferenceClient selects the repository through `model`; it has no
+        # `repo_id` keyword, so passing one fails with a TypeError.
+        client = InferenceClient(
+            model="someUser/gpt-4o-mini",  # <--- Replace with your real GPT-4o-mini repo
+            token=api_key
+        )
+        # We use the text_generation endpoint; adapt if your model differs.
+        # Called without details/stream options it is expected to return the
+        # generated text as a plain string.
+        response = client.text_generation(user_prompt, max_new_tokens=128)
+        return response
+    except Exception as e:
+        return f"Error calling GPT-4o-mini: {str(e)}"
+def call_openai(api_key, user_prompt):
+    """
+    Request a short text completion from OpenAI for ``user_prompt``.
+    Works with both generations of the ``openai`` package: the client-based
+    SDK (>=1.0) and the legacy module-level API. Returns the completion text,
+    or an "OpenAI Error: ..." message if anything goes wrong.
+    """
+    # text-davinci-003 has been retired by OpenAI; gpt-3.5-turbo-instruct is the
+    # replacement documented for the Completions endpoint.
+    # NOTE(review): confirm this model is still offered for your account.
+    model_name = "gpt-3.5-turbo-instruct"
+    try:
+        if hasattr(openai, "OpenAI"):
+            # openai>=1.0 removed openai.Completion; requests go through a client
+            # object, which also avoids mutating the module-wide api_key.
+            client = openai.OpenAI(api_key=api_key)
+            response = client.completions.create(
+                model=model_name,
+                prompt=user_prompt,
+                max_tokens=128
+            )
+            return response.choices[0].text.strip()
+        # Legacy SDK (<1.0): module-level key and Completion resource.
+        openai.api_key = api_key
+        response = openai.Completion.create(
+            model=model_name,
+            prompt=user_prompt,
+            max_tokens=128
+        )
+        return response["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"OpenAI Error: {str(e)}"
+def call_deepseek(api_key, user_prompt):
+    """
+    Hypothetical function to call a DeepSeek API endpoint.
+    Replace the URL and payload with real DeepSeek logic as needed.
+    POSTs the prompt as JSON with a bearer token and returns the text of the
+    first choice, or a "DeepSeek Error: ..." message on any failure
+    (HTTP error, timeout, unexpected response shape).
+    """
+    try:
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}"
+        }
+        payload = {
+            "prompt": user_prompt,
+            "max_tokens": 128
+        }
+        # Example POST; adapt to the real DeepSeek endpoint.
+        # requests has no default timeout, so without one a stalled server
+        # would block this handler indefinitely.
+        response = requests.post(
+            "https://api.deepseek.ai/v1/chat",
+            json=payload,
+            headers=headers,
+            timeout=30
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"DeepSeek Error: {str(e)}"
+def call_gemini(api_key, user_prompt):
+    """
+    Placeholder for a future Gemini integration.
+    Both arguments are accepted so the signature matches the other provider
+    functions, but they are currently ignored and a fixed notice is returned.
+    """
+    placeholder_notice = "(Gemini usage not yet implemented; placeholder)"
+    return placeholder_notice
+def call_llm(api_provider, api_key, user_prompt):
+    """
+    Route ``user_prompt`` to the selected LLM provider.
+    ``api_provider`` is matched case-insensitively against GPT-4o-mini,
+    OpenAI, DeepSeek and Gemini. Returns the provider's reply, or an
+    explanatory message when the API key is missing or the provider is
+    unknown (including a provider of None).
+    """
+    # Normalize once; a cleared dropdown can deliver None, which must not crash.
+    provider_lower = str(api_provider or "").strip().lower()
+    if not api_key:
+        if provider_lower == "gpt-4o-mini":
+            return "No API key provided. Using GPT-4o-mini default is not possible without HF key."
+        return "No API key provided."
+    if provider_lower == "gpt-4o-mini":
+        return call_gpt4o_mini(api_key, user_prompt)
+    if provider_lower == "openai":
+        return call_openai(api_key, user_prompt)
+    if provider_lower == "deepseek":
+        return call_deepseek(api_key, user_prompt)
+    if provider_lower == "gemini":
+        return call_gemini(api_key, user_prompt)
+    return f"Unknown provider: {api_provider}. Please choose GPT-4o-mini, OpenAI, DeepSeek, or Gemini."
+# --------------------------------------------------------------------------------
+# ADVANCED DATA ANALYSIS (extended_analysis)
+# --------------------------------------------------------------------------------
+def extended_analysis(df):
+    """
+    Sample advanced analysis:
+      1. Correlation heatmap for numeric columns (saved to heatmap.png)
+      2. Bar plot of 'Career' counts, if present (saved to barplot_career.png)
+      3. Simple logistic regression predicting 'Career' from the numeric columns
+    Returns ``(output_paths, accuracy_info)``: the list of image files written
+    and a one-line description of the classification outcome. ``df`` itself is
+    not modified.
+    """
+    output_paths = []
+    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    has_career = "Career" in df.columns
+    # 1) Correlation Heatmap
+    if len(numeric_cols) > 1:
+        corr = df[numeric_cols].corr()
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
+        plt.title("Correlation Heatmap")
+        heatmap_path = "heatmap.png"
+        plt.savefig(heatmap_path)
+        plt.close()
+        output_paths.append(heatmap_path)
+    # 2) Bar Plot of 'Career' if present
+    if has_career:
+        plt.figure(figsize=(8, 5))
+        df["Career"].value_counts().plot(kind="bar")
+        plt.title("Count of Each Career")
+        plt.xlabel("Career")
+        plt.ylabel("Count")
+        barplot_path = "barplot_career.png"
+        plt.savefig(barplot_path)
+        plt.close()
+        output_paths.append(barplot_path)
+    # 3) Simple Logistic Regression if 'Career' exists with multiple categories
+    # The target must never be one of its own features (a numeric 'Career'
+    # would otherwise leak into X).
+    feature_cols = [col for col in numeric_cols if col != "Career"]
+    if not (has_career and feature_cols):
+        return output_paths, "No 'Career' column or insufficient numeric data for classification."
+    # Rows without a label cannot be used for training; work on a filtered
+    # view and keep the encoded labels local instead of adding a column to df.
+    labelled = df.dropna(subset=["Career"])
+    # astype(str) gives the encoder uniformly typed labels even for mixed columns.
+    y = LabelEncoder().fit_transform(labelled["Career"].astype(str))
+    if len(np.unique(y)) <= 1:
+        return output_paths, "Career column has only one class; no classification performed."
+    X = labelled[feature_cols].fillna(0)
+    try:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        model = LogisticRegression(max_iter=1000)
+        model.fit(X_train, y_train)
+        score = model.score(X_test, y_test)
+    except ValueError as err:
+        # Tiny datasets can fail to split or leave a single class in the
+        # training fold; report that rather than losing the plots saved above.
+        return output_paths, f"Classification skipped: {err}"
+    return output_paths, f"Logistic Regression accuracy on test set: {score:.2f}"
+# --------------------------------------------------------------------------------
+# MAIN ANALYSIS AND VISUALIZATION FUNCTION
+# --------------------------------------------------------------------------------
+def analyze_and_visualize(
+    file,
+    message,
+    history,
+    api_provider,
+    api_key
+):
+    """
+    Loads CSV, gives a summary, calls LLM for suggestions if an API key is provided,
+    does extended analysis if user requests ("sample analysis", "extended analysis", etc.),
+    and returns results/plots in the chatbot.
+    Returns ``history`` extended with the new chat entries: one (message, text)
+    pair followed by one (None, (image_path,)) pair per generated plot. If no
+    file was uploaded, or anything fails, a single explanatory entry is
+    appended instead of raising.
+    """
+    history = history or []
+    # Without an upload there is nothing to analyse; say so plainly instead of
+    # surfacing an AttributeError from `file.name`.
+    if file is None:
+        return history + [(message, "Please upload a CSV file before sending a message.")]
+    try:
+        # Load CSV
+        # The upload is read through its `.name` path; fall back to the value
+        # itself in case the component hands over a plain path string.
+        csv_path = getattr(file, "name", file)
+        df = pd.read_csv(csv_path)
+        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+        categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
+        # Basic info
+        summary = (
+            f"**File**: {csv_path}\n"
+            f"**Shape**: {df.shape[0]} rows, {df.shape[1]} columns\n"
+            f"**Numerical Columns**: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
+            f"**Categorical Columns**: {', '.join(categorical_cols) if categorical_cols else 'None'}\n"
+        )
+        # LLM suggestions
+        if api_key:
+            user_prompt = (
+                f"Data Summary:\n{summary}\n\n"
+                f"User question or request: {message}\n"
+                f"Suggest advanced data analysis or steps if relevant."
+            )
+            llm_response = call_llm(api_provider, api_key, user_prompt)
+            llm_suggestions = f"\n**LLM Suggestions**:\n{llm_response}\n"
+        else:
+            llm_suggestions = "\n(No LLM suggestions because no API key provided.)\n"
+        # Always produce example histogram if there's at least one numeric column
+        hist_path = None
+        if numeric_cols:
+            plt.figure(figsize=(6, 4))
+            sns.histplot(df[numeric_cols[0]], kde=True)
+            plt.title(f"Distribution of '{numeric_cols[0]}'")
+            plt.tight_layout()
+            hist_path = "temp_plot.png"
+            plt.savefig(hist_path)
+            plt.close()
+        # Check if the user wants extended analysis
+        trigger_phrases = ["sample analysis", "extended analysis", "advanced analysis", "run analysis"]
+        analysis_paths = []
+        accuracy_info = ""
+        requested = (message or "").lower()
+        if any(phrase in requested for phrase in trigger_phrases):
+            analysis_paths, accuracy_info = extended_analysis(df)
+        # Build final response text
+        response_text = summary + llm_suggestions
+        if accuracy_info:
+            response_text += f"\n**ML Model Info**: {accuracy_info}\n"
+        # Construct the final chatbot content
+        chat_content = [(message, response_text)]
+        if hist_path:
+            chat_content.append((None, (hist_path,)))
+        chat_content.extend((None, (path,)) for path in analysis_paths)
+        return history + chat_content
+    except Exception as e:
+        # Top-level UI boundary: report the failure in the chat instead of raising.
+        return history + [(message, f"Error: {str(e)}")]
+# --------------------------------------------------------------------------------
+# CREATING THE GRADIO APP
+# --------------------------------------------------------------------------------
+def create_demo():
+    """
+    Assemble the Gradio Blocks UI and return it with queuing enabled.
+    Layout: title and feature list, a row with the provider dropdown and API
+    key box, the CSV upload, the chatbot output, the message box, and the
+    Send / Reset buttons. Pressing Enter in the message box and clicking Send
+    both run analyze_and_visualize and then clear the message box.
+    """
+    provider_choices = ["GPT-4o-mini", "OpenAI", "DeepSeek", "Gemini"]
+    # Handlers stay lambdas on purpose so their function names are unchanged.
+    run_analysis = lambda f, m, h, p, k: analyze_and_visualize(f, m, h or [], p, k)
+    clear_message = lambda: ""
+    def reset_chat():
+        return []
+    with gr.Blocks() as demo:
+        gr.Markdown("# 🤖 GPT-4o-mini (Default) + Multi-Provider AI Data Analysis Assistant")
+        gr.Markdown(
+            """
+            **Features**:
+            - Default LLM: GPT-4o-mini on Hugging Face (requires HF API key).
+            - Other providers: **OpenAI**, **DeepSeek**, **Gemini** (enter their respective API keys).
+            - Upload CSV for data summary & histograms.
+            - Type "sample analysis" or "extended analysis" to trigger correlation heatmaps, bar plots, and a simple logistic regression.
+            """
+        )
+        with gr.Row():
+            api_provider = gr.Dropdown(
+                choices=provider_choices,
+                value="GPT-4o-mini",  # default
+                label="LLM Provider",
+            )
+            api_key = gr.Textbox(
+                label="LLM API Key",
+                placeholder="Enter your Hugging Face/DeepSeek/OpenAI/Gemini API key here..."
+            )
+        file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
+        chatbot = gr.Chatbot(label="Analysis Output")
+        msg = gr.Textbox(
+            label="Message",
+            placeholder="Ask the AI or type 'sample analysis' for extended analysis..."
+        )
+        send_btn = gr.Button("Send")
+        reset_btn = gr.Button("Reset Chat")
+        # Enter in the textbox and the Send button share one wiring:
+        # run the analysis, then empty the message box.
+        for register in (msg.submit, send_btn.click):
+            register(
+                fn=run_analysis,
+                inputs=[file_input, msg, chatbot, api_provider, api_key],
+                outputs=[chatbot]
+            ).then(clear_message, None, [msg])
+        reset_btn.click(fn=reset_chat, inputs=[], outputs=[chatbot])
+        demo.queue()
+        return demo
+# Build the UI at import time so `demo` exists as a module-level name.
+demo = create_demo()
+# Start the server (with share=True) only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(share=True)