|
import os |
|
import sys |
|
import subprocess |
|
import requests |
|
import gradio as gr |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import numpy as np |
|
|
|
from sklearn.model_selection import train_test_split |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.preprocessing import LabelEncoder |
|
|
|
|
|
|
|
|
|
|
|
def install_library(library):
    """
    Install *library* with pip, using the interpreter that runs this module.

    Meant for rarely used packages that are NOT listed in requirements.txt.

    Args:
        library: Requirement string handed to ``pip install``.

    Returns:
        A human-readable status string. Failures are reported through the
        returned text instead of being raised.
    """
    pip_command = [sys.executable, "-m", "pip", "install", library]
    try:
        subprocess.check_call(pip_command)
    except Exception as exc:
        return f"Error installing {library}: {exc}"
    else:
        return f"Successfully installed {library}."
|
|
|
def dynamic_import(library, alias=None):
    """
    Import *library* at runtime and bind it in this module's globals.

    If the first import fails, the library is installed through
    ``install_library`` and the import is retried once.

    Binding follows the semantics of the ``import`` statement:

    * with *alias*: the requested module itself is bound to *alias*
      (like ``import a.b as alias``);
    * without *alias*: the top-level package is bound under its own name
      (like ``import a.b`` binding ``a``).

    Args:
        library: Module name to import; may be dotted (e.g. ``"os.path"``).
        alias: Optional global name to bind the imported module to.

    Returns:
        The imported module object (the submodule for dotted names).

    Raises:
        ImportError: If the module still cannot be imported after the
            installation attempt.
    """
    import importlib

    try:
        module = importlib.import_module(library)
    except ImportError:
        print(install_library(library))
        # Freshly installed packages may be invisible to the import system
        # until its finder caches are invalidated.
        importlib.invalidate_caches()
        module = importlib.import_module(library)

    if alias:
        globals()[alias] = module
    else:
        top_level = library.partition(".")[0]
        globals()[top_level] = importlib.import_module(top_level)
    return module
|
|
|
|
|
|
|
|
|
|
|
import openai |
|
from huggingface_hub import InferenceClient |
|
|
|
def call_gpt4o_mini(api_key, user_prompt):
    """
    Call a GPT-4o-mini model hosted on Hugging Face.

    Replace 'someUser/gpt-4o-mini' with your actual model repo.

    Args:
        api_key: Hugging Face token used to authenticate the request.
        user_prompt: Prompt text sent to the text-generation endpoint.

    Returns:
        The value returned by ``InferenceClient.text_generation`` on success,
        otherwise a human-readable error string (errors are never raised).
    """
    if not api_key:
        return "No Hugging Face API key provided. Cannot call GPT-4o-mini."

    model_repo = "someUser/gpt-4o-mini"
    try:
        # InferenceClient selects the model through `model`; it has no
        # `repo_id` parameter, so passing one fails before any request is made.
        client = InferenceClient(model=model_repo, token=api_key)
        return client.text_generation(user_prompt, max_new_tokens=128)
    except Exception as e:
        return f"Error calling GPT-4o-mini: {e}"
|
|
|
def call_openai(api_key, user_prompt):
    """
    Call OpenAI's text completion API (example usage).

    Works with both generations of the ``openai`` package: the client-based
    API (openai>=1.0) is used when ``openai.OpenAI`` exists, otherwise the
    legacy module-level ``openai.Completion`` interface is used.

    Args:
        api_key: OpenAI API key.
        user_prompt: Prompt text to complete.

    Returns:
        The stripped completion text on success, otherwise a human-readable
        error string (errors are never raised).
    """
    if not api_key:
        return "No OpenAI API key provided."

    # text-davinci-003 has been retired; gpt-3.5-turbo-instruct is its
    # replacement on the completions endpoint.
    model_name = "gpt-3.5-turbo-instruct"
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.Completion; a per-call client also
            # avoids mutating the module-wide api_key.
            client = openai.OpenAI(api_key=api_key)
            response = client.completions.create(
                model=model_name,
                prompt=user_prompt,
                max_tokens=128,
            )
            return response.choices[0].text.strip()

        # Legacy SDK (<1.0).
        openai.api_key = api_key
        response = openai.Completion.create(
            model=model_name,
            prompt=user_prompt,
            max_tokens=128,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"OpenAI Error: {e}"
|
|
|
def call_deepseek(api_key, user_prompt):
    """
    Hypothetical function to call a DeepSeek API endpoint.

    The URL, payload shape and response shape below are placeholders;
    replace with real DeepSeek logic as needed.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        user_prompt: Prompt text placed in the request payload.

    Returns:
        The stripped text of the first choice on success, otherwise a
        human-readable error string (errors are never raised).
    """
    if not api_key:
        return "No DeepSeek API key provided."

    request_timeout_seconds = 30
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "prompt": user_prompt,
        "max_tokens": 128,
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would block the calling worker.
        response = requests.post(
            "https://api.deepseek.ai/v1/chat",
            json=payload,
            headers=headers,
            timeout=request_timeout_seconds,
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["text"].strip()
    except Exception as e:
        return f"DeepSeek Error: {e}"
|
|
|
def call_gemini(api_key, user_prompt):
    """
    Placeholder for a Gemini LLM call.

    Both arguments are accepted but currently ignored; a fixed notice is
    returned. Replace with real Gemini logic.
    """
    placeholder_notice = "(Gemini usage not yet implemented; placeholder)"
    return placeholder_notice
|
|
|
def call_llm(api_provider, api_key, user_prompt):
    """
    Route *user_prompt* to the LLM backend named by *api_provider*.

    Provider names are matched case-insensitively. A falsy *api_key* is
    rejected up front, and an unrecognised provider yields an explanatory
    message; in both cases a string is returned rather than an error raised.
    """
    provider_key = api_provider.lower()

    if not api_key:
        if provider_key == "gpt-4o-mini":
            return "No API key provided. Using GPT-4o-mini default is not possible without HF key."
        return "No API key provided."

    # Lambdas keep the lookup lazy: only the selected backend is touched.
    backends = {
        "gpt-4o-mini": lambda: call_gpt4o_mini(api_key, user_prompt),
        "openai": lambda: call_openai(api_key, user_prompt),
        "deepseek": lambda: call_deepseek(api_key, user_prompt),
        "gemini": lambda: call_gemini(api_key, user_prompt),
    }
    backend = backends.get(provider_key)
    if backend is None:
        return f"Unknown provider: {api_provider}. Please choose GPT-4o-mini, OpenAI, DeepSeek, or Gemini."
    return backend()
|
|
|
|
|
|
|
|
|
def extended_analysis(df):
    """
    Sample advanced analysis:

    1. Correlation heatmap for numeric columns (needs at least two).
    2. Bar plot of 'Career' counts (if the column is present).
    3. Simple logistic regression predicting 'Career' from the numeric
       columns, when there is more than one class.

    Plots are written to ``heatmap.png`` and ``barplot_career.png`` relative
    to the current working directory. The input DataFrame is not modified.

    Args:
        df: pandas DataFrame to analyse.

    Returns:
        A tuple ``(output_paths, accuracy_info)``: the list of image paths
        that were written, and a one-line description of the classification
        outcome.
    """
    output_paths = []
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    has_career = "Career" in df.columns

    # 1. Correlation heatmap.
    if len(numeric_cols) > 1:
        corr = df[numeric_cols].corr()
        heatmap_path = "heatmap.png"
        plt.figure(figsize=(8, 6))
        try:
            sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
            plt.title("Correlation Heatmap")
            plt.savefig(heatmap_path)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close()
        output_paths.append(heatmap_path)

    # 2. Bar plot of career frequencies.
    if has_career:
        barplot_path = "barplot_career.png"
        plt.figure(figsize=(8, 5))
        try:
            df["Career"].value_counts().plot(kind="bar")
            plt.title("Count of Each Career")
            plt.xlabel("Career")
            plt.ylabel("Count")
            plt.savefig(barplot_path)
        finally:
            plt.close()
        output_paths.append(barplot_path)

    # 3. Classification. A numeric 'Career' column must not be used as a
    # feature for predicting itself.
    feature_cols = [col for col in numeric_cols if col != "Career"]
    if not has_career or not feature_cols:
        return output_paths, "No 'Career' column or insufficient numeric data for classification."

    # Encode into a local array instead of adding a column to the caller's df.
    y = LabelEncoder().fit_transform(df["Career"])
    if len(np.unique(y)) <= 1:
        return output_paths, "Career column has only one class; no classification performed."

    X = df[feature_cols].fillna(0)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
    except ValueError as exc:
        # E.g. too few rows to split, or a training split with one class.
        # Report it instead of discarding the plots produced above.
        return output_paths, f"Classification could not be performed: {exc}"

    return output_paths, f"Logistic Regression accuracy on test set: {score:.2f}"
|
|
|
|
|
|
|
|
|
def analyze_and_visualize(
    file,
    message,
    history,
    api_provider,
    api_key
):
    """
    Load a CSV, summarise it, and build the chatbot reply.

    Steps: read the CSV, build a text summary, ask the selected LLM for
    suggestions when an API key is provided, plot a histogram of the first
    numeric column, and run ``extended_analysis`` when the message contains
    a trigger phrase ("sample analysis", "extended analysis", etc.).

    Args:
        file: Uploaded file; either an object exposing the path as ``.name``
            or a plain path string. ``None`` means nothing was uploaded.
        message: User message; ``None`` is treated as an empty string.
        history: Existing chat history as a list of ``(user, bot)`` tuples;
            ``None`` is treated as empty. It is not modified in place.
        api_provider: Provider name passed to ``call_llm``.
        api_key: API key for the provider; falsy skips the LLM call.

    Returns:
        A new history list: the previous entries, one ``(message, text)``
        entry, and one ``(None, (path,))`` entry per generated image. Any
        failure is reported as a ``(message, "Error: ...")`` entry instead
        of being raised.
    """
    history = history or []
    message = message or ""

    if file is None:
        # Give a clear instruction rather than an opaque attribute error.
        return history + [
            (message, "Error: no CSV file uploaded. Please upload a CSV file first.")
        ]

    try:
        file_path = getattr(file, "name", file)
        df = pd.read_csv(file_path)
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

        summary = (
            f"**File**: {file_path}\n"
            f"**Shape**: {df.shape[0]} rows, {df.shape[1]} columns\n"
            f"**Numerical Columns**: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
            f"**Categorical Columns**: {', '.join(categorical_cols) if categorical_cols else 'None'}\n"
        )

        if api_key:
            user_prompt = (
                f"Data Summary:\n{summary}\n\n"
                f"User question or request: {message}\n"
                f"Suggest advanced data analysis or steps if relevant."
            )
            llm_response = call_llm(api_provider, api_key, user_prompt)
            llm_suggestions = f"\n**LLM Suggestions**:\n{llm_response}\n"
        else:
            llm_suggestions = "\n(No LLM suggestions because no API key provided.)\n"

        hist_path = None
        if numeric_cols:
            first_numeric = numeric_cols[0]
            plt.figure(figsize=(6, 4))
            try:
                sns.histplot(df[first_numeric], kde=True)
                plt.title(f"Distribution of '{first_numeric}'")
                plt.tight_layout()
                plt.savefig("temp_plot.png")
            finally:
                # Always release the figure, even if plotting or saving fails.
                plt.close()
            hist_path = "temp_plot.png"

        trigger_phrases = ["sample analysis", "extended analysis", "advanced analysis", "run analysis"]
        analysis_paths = []
        accuracy_info = ""
        lowered_message = message.lower()
        if any(phrase in lowered_message for phrase in trigger_phrases):
            analysis_paths, accuracy_info = extended_analysis(df)

        response_text = summary + llm_suggestions
        if accuracy_info:
            response_text += f"\n**ML Model Info**: {accuracy_info}\n"

        chat_content = [(message, response_text)]
        if hist_path:
            chat_content.append((None, (hist_path,)))
        chat_content.extend((None, (path,)) for path in analysis_paths)

        return history + chat_content

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing.
        return history + [(message, f"Error: {e}")]
|
|
|
|
|
|
|
|
|
def create_demo():
    """
    Build the Gradio Blocks UI for the data analysis assistant.

    The layout has a provider dropdown, an API key box, a CSV upload, a
    chatbot pane, a message box, and Send / Reset buttons. Pressing Enter in
    the message box and clicking Send run the same handler, after which the
    message box is cleared.

    Returns:
        The ``gr.Blocks`` app with queuing enabled (not yet launched).
    """

    def respond(file, message, history, provider, key):
        # The chatbot value may be empty before the first exchange.
        return analyze_and_visualize(file, message, history or [], provider, key)

    def clear_message():
        return ""

    def reset_chat():
        return []

    features_md = "\n".join(
        [
            "**Features**:",
            "- Default LLM: GPT-4o-mini on Hugging Face (requires HF API key).",
            "- Other providers: **OpenAI**, **DeepSeek**, **Gemini** (enter their respective API keys).",
            "- Upload CSV for data summary & histograms.",
            '- Type "sample analysis" or "extended analysis" to trigger correlation heatmaps, bar plots, and a simple logistic regression.',
        ]
    )

    with gr.Blocks() as demo:
        gr.Markdown("# 🤖 GPT-4o-mini (Default) + Multi-Provider AI Data Analysis Assistant")
        gr.Markdown(features_md)

        with gr.Row():
            api_provider = gr.Dropdown(
                choices=["GPT-4o-mini", "OpenAI", "DeepSeek", "Gemini"],
                value="GPT-4o-mini",
                label="LLM Provider",
            )
            api_key = gr.Textbox(
                label="LLM API Key",
                placeholder="Enter your Hugging Face/DeepSeek/OpenAI/Gemini API key here...",
                # Mask the secret so it is not shown in clear text on screen.
                type="password",
            )

        file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
        chatbot = gr.Chatbot(label="Analysis Output")
        msg = gr.Textbox(
            label="Message",
            placeholder="Ask the AI or type 'sample analysis' for extended analysis...",
        )

        send_btn = gr.Button("Send")
        reset_btn = gr.Button("Reset Chat")

        handler_inputs = [file_input, msg, chatbot, api_provider, api_key]

        msg.submit(
            fn=respond,
            inputs=handler_inputs,
            outputs=[chatbot],
        ).then(clear_message, None, [msg])

        send_btn.click(
            fn=respond,
            inputs=handler_inputs,
            outputs=[chatbot],
        ).then(clear_message, None, [msg])

        reset_btn.click(fn=reset_chat, inputs=[], outputs=[chatbot])

    demo.queue()
    return demo
|
|
|
|
|
# Build the app at import time so `demo` exists as a module-level name.
demo = create_demo()


if __name__ == "__main__":
    # share=True asks Gradio to launch with a shareable link.
    demo.launch(share=True)
|
|