import os
import sys
import subprocess
import requests
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
# --------------------------------------------------------------------------------
# OPTIONAL: dynamic installation for rarely used packages not in requirements.txt
# --------------------------------------------------------------------------------
def install_library(library):
"""
Install a library using pip.
Useful for rarely used packages NOT in requirements.txt.
"""
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", library])
return f"Successfully installed {library}."
except Exception as e:
return f"Error installing {library}: {str(e)}"
def dynamic_import(library, alias=None):
    """
    Dynamically import a library. If not found, try to install it, then import again.
    The module is bound in globals() under `alias` if given, else under `library`.
    """
    name = alias or library
    try:
        globals()[name] = __import__(library)
    except ImportError:
        install_msg = install_library(library)
        print(install_msg)
        globals()[name] = __import__(library)
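# Minimal usage sketch for dynamic_import (commented out so importing this app
# never installs packages as a side effect); "tabulate" is only an
# illustrative package name:
#
#   dynamic_import("tabulate")
#   print(tabulate.tabulate([["a", 1], ["b", 2]]))  # module is now in globals()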
# --------------------------------------------------------------------------------
# LLM CALLS: GPT-4o-mini, OpenAI, DeepSeek, Gemini
# --------------------------------------------------------------------------------
import openai
from huggingface_hub import InferenceClient
def call_gpt4o_mini(api_key, user_prompt):
"""
Calls a GPT-4o-mini model hosted on Hugging Face.
Replace 'someUser/gpt-4o-mini' with your actual model repo.
"""
if not api_key:
return "No Hugging Face API key provided. Cannot call GPT-4o-mini."
    try:
        client = InferenceClient(
            model="someUser/gpt-4o-mini",  # <--- Replace with your real GPT-4o-mini repo
            token=api_key
        )
        # text_generation returns a plain string by default; adapt if your model
        # is served behind a different task or endpoint.
        response = client.text_generation(user_prompt, max_new_tokens=128)
        return response
except Exception as e:
return f"Error calling GPT-4o-mini: {str(e)}"
def call_openai(api_key, user_prompt):
    """Calls OpenAI's Chat Completions API (openai>=1.0 client)."""
    try:
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": user_prompt}],
            max_tokens=128
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"OpenAI Error: {str(e)}"
def call_deepseek(api_key, user_prompt):
    """
    Calls DeepSeek's OpenAI-compatible chat completions endpoint.
    Verify the endpoint and model name against DeepSeek's current docs.
    """
    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{"role": "user", "content": user_prompt}],
            "max_tokens": 128
        }
        response = requests.post(
            "https://api.deepseek.com/chat/completions",
            json=payload,
            headers=headers
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"DeepSeek Error: {str(e)}"
def call_gemini(api_key, user_prompt):
"""
Hypothetical function for Gemini LLM.
Replace with real Gemini logic.
"""
return "(Gemini usage not yet implemented; placeholder)"
def call_llm(api_provider, api_key, user_prompt):
"""Routes calls to the correct LLM provider."""
    if not api_key:
        if api_provider.lower() == "gpt-4o-mini":
            return "No API key provided. The default GPT-4o-mini requires a Hugging Face API key."
        return "No API key provided."
provider_lower = api_provider.lower()
if provider_lower == "gpt-4o-mini":
return call_gpt4o_mini(api_key, user_prompt)
elif provider_lower == "openai":
return call_openai(api_key, user_prompt)
elif provider_lower == "deepseek":
return call_deepseek(api_key, user_prompt)
elif provider_lower == "gemini":
return call_gemini(api_key, user_prompt)
else:
return f"Unknown provider: {api_provider}. Please choose GPT-4o-mini, OpenAI, DeepSeek, or Gemini."
# --------------------------------------------------------------------------------
# ADVANCED DATA ANALYSIS (extended_analysis)
# --------------------------------------------------------------------------------
def extended_analysis(df):
"""
Sample advanced analysis:
1. Correlation heatmap for numeric columns
2. Bar plot of 'Career' (if present)
3. Simple logistic regression classification if 'Career' is suitable
"""
output_paths = []
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
# 1) Correlation Heatmap
if len(numeric_cols) > 1:
corr = df[numeric_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
heatmap_path = "heatmap.png"
plt.savefig(heatmap_path)
plt.close()
output_paths.append(heatmap_path)
# 2) Bar Plot of 'Career' if present
if "Career" in df.columns:
plt.figure(figsize=(8, 5))
df["Career"].value_counts().plot(kind="bar")
plt.title("Count of Each Career")
plt.xlabel("Career")
plt.ylabel("Count")
barplot_path = "barplot_career.png"
plt.savefig(barplot_path)
plt.close()
output_paths.append(barplot_path)
# 3) Simple Logistic Regression if 'Career' exists with multiple categories
if "Career" in df.columns and len(numeric_cols) > 0:
le = LabelEncoder()
df["Career_encoded"] = le.fit_transform(df["Career"])
X = df[numeric_cols].fillna(0)
y = df["Career_encoded"]
if len(np.unique(y)) > 1:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
accuracy_info = f"Logistic Regression accuracy on test set: {score:.2f}"
else:
accuracy_info = "Career column has only one class; no classification performed."
else:
accuracy_info = "No 'Career' column or insufficient numeric data for classification."
return output_paths, accuracy_info
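# Quick sanity-check sketch for extended_analysis (commented out so the app
# does not write plot files on import); the toy data is illustrative only:
#
#   toy = pd.DataFrame({
#       "GPA": [3.1, 3.8, 2.9, 3.5],
#       "Age": [20, 22, 21, 23],
#       "Career": ["Engineer", "Doctor", "Engineer", "Doctor"],
#   })
#   paths, info = extended_analysis(toy)
#   print(info)   # e.g. logistic-regression accuracy on the held-out split
#   print(paths)  # image paths: heatmap.png, barplot_career.png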
# --------------------------------------------------------------------------------
# MAIN ANALYSIS AND VISUALIZATION FUNCTION
# --------------------------------------------------------------------------------
def analyze_and_visualize(
file,
message,
history,
api_provider,
api_key
):
"""
Loads CSV, gives a summary, calls LLM for suggestions if an API key is provided,
does extended analysis if user requests ("sample analysis", "extended analysis", etc.),
and returns results/plots in the chatbot.
"""
    try:
        # Guard against a missing upload before touching file.name
        if file is None:
            return history + [(message, "Please upload a CSV file first.")]
        # Load CSV
        df = pd.read_csv(file.name)
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
# Basic info
summary = (
f"**File**: {file.name}\n"
f"**Shape**: {df.shape[0]} rows, {df.shape[1]} columns\n"
f"**Numerical Columns**: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
f"**Categorical Columns**: {', '.join(categorical_cols) if categorical_cols else 'None'}\n"
)
# LLM suggestions
llm_suggestions = ""
if api_key:
user_prompt = (
f"Data Summary:\n{summary}\n\n"
f"User question or request: {message}\n"
f"Suggest advanced data analysis or steps if relevant."
)
llm_response = call_llm(api_provider, api_key, user_prompt)
llm_suggestions = f"\n**LLM Suggestions**:\n{llm_response}\n"
else:
llm_suggestions = "\n(No LLM suggestions because no API key provided.)\n"
        # Produce an example histogram if there's at least one numeric column
hist_path = None
if numeric_cols:
plt.figure(figsize=(6, 4))
sns.histplot(df[numeric_cols[0]], kde=True)
plt.title(f"Distribution of '{numeric_cols[0]}'")
plt.tight_layout()
hist_path = "temp_plot.png"
plt.savefig(hist_path)
plt.close()
# Check if the user wants extended analysis
trigger_phrases = ["sample analysis", "extended analysis", "advanced analysis", "run analysis"]
analysis_paths = []
accuracy_info = ""
if any(phrase in message.lower() for phrase in trigger_phrases):
analysis_paths, accuracy_info = extended_analysis(df)
# Build final response text
response_text = summary + llm_suggestions
if accuracy_info:
response_text += f"\n**ML Model Info**: {accuracy_info}\n"
# Construct the final chatbot content
chat_content = [(message, response_text)]
if hist_path:
chat_content.append((None, (hist_path,)))
for path in analysis_paths:
chat_content.append((None, (path,)))
return history + chat_content
except Exception as e:
return history + [(message, f"Error: {str(e)}")]
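# Offline sketch of analyze_and_visualize (commented out; uses a tiny stand-in
# for the Gradio upload object and an empty API key, so no LLM call is made):
#
#   from types import SimpleNamespace
#   pd.DataFrame({"GPA": [3.1, 3.8], "Career": ["Engineer", "Doctor"]}).to_csv(
#       "toy.csv", index=False)
#   chat = analyze_and_visualize(
#       SimpleNamespace(name="toy.csv"), "extended analysis", [], "OpenAI", "")
#   print(chat[0][1])  # the markdown summary (plus ML info if analysis ran)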
# --------------------------------------------------------------------------------
# CREATING THE GRADIO APP
# --------------------------------------------------------------------------------
def create_demo():
with gr.Blocks() as demo:
gr.Markdown("# 🤖 GPT-4o-mini (Default) + Multi-Provider AI Data Analysis Assistant")
gr.Markdown(
"""
**Features**:
- Default LLM: GPT-4o-mini on Hugging Face (requires HF API key).
- Other providers: **OpenAI**, **DeepSeek**, **Gemini** (enter their respective API keys).
- Upload CSV for data summary & histograms.
- Type "sample analysis" or "extended analysis" to trigger correlation heatmaps, bar plots, and a simple logistic regression.
"""
)
with gr.Row():
api_provider = gr.Dropdown(
choices=["GPT-4o-mini", "OpenAI", "DeepSeek", "Gemini"],
value="GPT-4o-mini", # default
label="LLM Provider",
)
api_key = gr.Textbox(
label="LLM API Key",
placeholder="Enter your Hugging Face/DeepSeek/OpenAI/Gemini API key here..."
)
file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
chatbot = gr.Chatbot(label="Analysis Output")
msg = gr.Textbox(
label="Message",
placeholder="Ask the AI or type 'sample analysis' for extended analysis..."
)
send_btn = gr.Button("Send")
reset_btn = gr.Button("Reset Chat")
def reset_chat():
return []
        def on_send(f, m, h, p, k):
            return analyze_and_visualize(f, m, h or [], p, k)

        msg.submit(
            fn=on_send,
            inputs=[file_input, msg, chatbot, api_provider, api_key],
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])
        send_btn.click(
            fn=on_send,
            inputs=[file_input, msg, chatbot, api_provider, api_key],
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])
reset_btn.click(fn=reset_chat, inputs=[], outputs=[chatbot])
demo.queue()
return demo
demo = create_demo()
if __name__ == "__main__":
demo.launch(share=True)