import os
import sys
import subprocess
import requests
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
# --------------------------------------------------------------------------------
# OPTIONAL: dynamic installation for rarely used packages not in requirements.txt
# --------------------------------------------------------------------------------
def install_library(library):
"""
Install a library using pip.
Useful for rarely used packages NOT in requirements.txt.
"""
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", library])
return f"Successfully installed {library}."
except Exception as e:
return f"Error installing {library}: {str(e)}"
def dynamic_import(library, alias=None):
    """
    Dynamically import a library. If not found, try to install it, then import again.
    The module is bound in globals() under `alias` if given, else under `library`.
    """
    name = alias or library
    try:
        globals()[name] = __import__(library)
    except ImportError:
        install_msg = install_library(library)
        print(install_msg)
        globals()[name] = __import__(library)
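# Minimal usage sketch for dynamic_import (commented out so importing this app
# never installs packages as a side effect); "tabulate" is only an
# illustrative package name:
#
#   dynamic_import("tabulate")
#   print(tabulate.tabulate([["a", 1], ["b", 2]]))  # module is now in globals()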
# --------------------------------------------------------------------------------
# LLM CALLS: GPT-4o-mini, OpenAI, DeepSeek, Gemini
# --------------------------------------------------------------------------------
import openai
from huggingface_hub import InferenceClient
def call_gpt4o_mini(api_key, user_prompt):
"""
Calls a GPT-4o-mini model hosted on Hugging Face.
Replace 'someUser/gpt-4o-mini' with your actual model repo.
"""
if not api_key:
return "No Hugging Face API key provided. Cannot call GPT-4o-mini."
    try:
        client = InferenceClient(
            model="someUser/gpt-4o-mini",  # <--- Replace with your real GPT-4o-mini repo
            token=api_key
        )
        # text_generation returns a plain string by default; adapt if your model
        # is served behind a different task or endpoint.
        response = client.text_generation(user_prompt, max_new_tokens=128)
        return response
except Exception as e:
return f"Error calling GPT-4o-mini: {str(e)}"
def call_openai(api_key, user_prompt):
    """Calls OpenAI's Chat Completions API (openai>=1.0 client)."""
    try:
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": user_prompt}],
            max_tokens=128
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"OpenAI Error: {str(e)}"
def call_deepseek(api_key, user_prompt):
    """
    Calls DeepSeek's OpenAI-compatible chat completions endpoint.
    Verify the endpoint and model name against DeepSeek's current docs.
    """
    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{"role": "user", "content": user_prompt}],
            "max_tokens": 128
        }
        response = requests.post(
            "https://api.deepseek.com/chat/completions",
            json=payload,
            headers=headers
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"DeepSeek Error: {str(e)}"
def call_gemini(api_key, user_prompt):
"""
Hypothetical function for Gemini LLM.
Replace with real Gemini logic.
"""
return "(Gemini usage not yet implemented; placeholder)"
def call_llm(api_provider, api_key, user_prompt):
"""Routes calls to the correct LLM provider."""
    if not api_key:
        if api_provider.lower() == "gpt-4o-mini":
            return "No API key provided. The default GPT-4o-mini requires a Hugging Face API key."
        return "No API key provided."
provider_lower = api_provider.lower()
if provider_lower == "gpt-4o-mini":
return call_gpt4o_mini(api_key, user_prompt)
elif provider_lower == "openai":
return call_openai(api_key, user_prompt)
elif provider_lower == "deepseek":
return call_deepseek(api_key, user_prompt)
elif provider_lower == "gemini":
return call_gemini(api_key, user_prompt)
else:
return f"Unknown provider: {api_provider}. Please choose GPT-4o-mini, OpenAI, DeepSeek, or Gemini."
# --------------------------------------------------------------------------------
# ADVANCED DATA ANALYSIS (extended_analysis)
# --------------------------------------------------------------------------------
def extended_analysis(df):
"""
Sample advanced analysis:
1. Correlation heatmap for numeric columns
2. Bar plot of 'Career' (if present)
3. Simple logistic regression classification if 'Career' is suitable
"""
output_paths = []
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
# 1) Correlation Heatmap
if len(numeric_cols) > 1:
corr = df[numeric_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
heatmap_path = "heatmap.png"
plt.savefig(heatmap_path)
plt.close()
output_paths.append(heatmap_path)
# 2) Bar Plot of 'Career' if present
if "Career" in df.columns:
plt.figure(figsize=(8, 5))
df["Career"].value_counts().plot(kind="bar")
plt.title("Count of Each Career")
plt.xlabel("Career")
plt.ylabel("Count")
barplot_path = "barplot_career.png"
plt.savefig(barplot_path)
plt.close()
output_paths.append(barplot_path)
# 3) Simple Logistic Regression if 'Career' exists with multiple categories
if "Career" in df.columns and len(numeric_cols) > 0:
le = LabelEncoder()
df["Career_encoded"] = le.fit_transform(df["Career"])
X = df[numeric_cols].fillna(0)
y = df["Career_encoded"]
if len(np.unique(y)) > 1:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
accuracy_info = f"Logistic Regression accuracy on test set: {score:.2f}"
else:
accuracy_info = "Career column has only one class; no classification performed."
else:
accuracy_info = "No 'Career' column or insufficient numeric data for classification."
return output_paths, accuracy_info
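# Quick sanity-check sketch for extended_analysis (commented out so the app
# does not write plot files on import); the toy data is illustrative only:
#
#   toy = pd.DataFrame({
#       "GPA": [3.1, 3.8, 2.9, 3.5],
#       "Age": [20, 22, 21, 23],
#       "Career": ["Engineer", "Doctor", "Engineer", "Doctor"],
#   })
#   paths, info = extended_analysis(toy)
#   print(info)   # e.g. logistic-regression accuracy on the held-out split
#   print(paths)  # image paths: heatmap.png, barplot_career.png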
# --------------------------------------------------------------------------------
# MAIN ANALYSIS AND VISUALIZATION FUNCTION
# --------------------------------------------------------------------------------
def analyze_and_visualize(
file,
message,
history,
api_provider,
api_key
):
"""
Loads CSV, gives a summary, calls LLM for suggestions if an API key is provided,
does extended analysis if user requests ("sample analysis", "extended analysis", etc.),
and returns results/plots in the chatbot.
"""
    try:
        # Guard against a missing upload before touching file.name
        if file is None:
            return history + [(message, "Please upload a CSV file first.")]
        # Load CSV
        df = pd.read_csv(file.name)
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
# Basic info
summary = (
f"**File**: {file.name}\n"
f"**Shape**: {df.shape[0]} rows, {df.shape[1]} columns\n"
f"**Numerical Columns**: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
f"**Categorical Columns**: {', '.join(categorical_cols) if categorical_cols else 'None'}\n"
)
# LLM suggestions
llm_suggestions = ""
if api_key:
user_prompt = (
f"Data Summary:\n{summary}\n\n"
f"User question or request: {message}\n"
f"Suggest advanced data analysis or steps if relevant."
)
llm_response = call_llm(api_provider, api_key, user_prompt)
llm_suggestions = f"\n**LLM Suggestions**:\n{llm_response}\n"
else:
llm_suggestions = "\n(No LLM suggestions because no API key provided.)\n"
        # Produce an example histogram if there's at least one numeric column
hist_path = None
if numeric_cols:
plt.figure(figsize=(6, 4))
sns.histplot(df[numeric_cols[0]], kde=True)
plt.title(f"Distribution of '{numeric_cols[0]}'")
plt.tight_layout()
hist_path = "temp_plot.png"
plt.savefig(hist_path)
plt.close()
# Check if the user wants extended analysis
trigger_phrases = ["sample analysis", "extended analysis", "advanced analysis", "run analysis"]
analysis_paths = []
accuracy_info = ""
if any(phrase in message.lower() for phrase in trigger_phrases):
analysis_paths, accuracy_info = extended_analysis(df)
# Build final response text
response_text = summary + llm_suggestions
if accuracy_info:
response_text += f"\n**ML Model Info**: {accuracy_info}\n"
# Construct the final chatbot content
chat_content = [(message, response_text)]
if hist_path:
chat_content.append((None, (hist_path,)))
for path in analysis_paths:
chat_content.append((None, (path,)))
return history + chat_content
except Exception as e:
return history + [(message, f"Error: {str(e)}")]
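# Offline sketch of analyze_and_visualize (commented out; uses a tiny stand-in
# for the Gradio upload object and an empty API key, so no LLM call is made):
#
#   from types import SimpleNamespace
#   pd.DataFrame({"GPA": [3.1, 3.8], "Career": ["Engineer", "Doctor"]}).to_csv(
#       "toy.csv", index=False)
#   chat = analyze_and_visualize(
#       SimpleNamespace(name="toy.csv"), "extended analysis", [], "OpenAI", "")
#   print(chat[0][1])  # the markdown summary (plus ML info if analysis ran)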
# --------------------------------------------------------------------------------
# CREATING THE GRADIO APP
# --------------------------------------------------------------------------------
def create_demo():
with gr.Blocks() as demo:
gr.Markdown("# 🤖 GPT-4o-mini (Default) + Multi-Provider AI Data Analysis Assistant")
gr.Markdown(
"""
**Features**:
- Default LLM: GPT-4o-mini on Hugging Face (requires HF API key).
- Other providers: **OpenAI**, **DeepSeek**, **Gemini** (enter their respective API keys).
- Upload CSV for data summary & histograms.
- Type "sample analysis" or "extended analysis" to trigger correlation heatmaps, bar plots, and a simple logistic regression.
"""
)
with gr.Row():
api_provider = gr.Dropdown(
choices=["GPT-4o-mini", "OpenAI", "DeepSeek", "Gemini"],
value="GPT-4o-mini", # default
label="LLM Provider",
)
api_key = gr.Textbox(
label="LLM API Key",
placeholder="Enter your Hugging Face/DeepSeek/OpenAI/Gemini API key here..."
)
file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
chatbot = gr.Chatbot(label="Analysis Output")
msg = gr.Textbox(
label="Message",
placeholder="Ask the AI or type 'sample analysis' for extended analysis..."
)
send_btn = gr.Button("Send")
reset_btn = gr.Button("Reset Chat")
def reset_chat():
return []
        def on_send(f, m, h, p, k):
            return analyze_and_visualize(f, m, h or [], p, k)

        msg.submit(
            fn=on_send,
            inputs=[file_input, msg, chatbot, api_provider, api_key],
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])
        send_btn.click(
            fn=on_send,
            inputs=[file_input, msg, chatbot, api_provider, api_key],
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])
reset_btn.click(fn=reset_chat, inputs=[], outputs=[chatbot])
demo.queue()
return demo
demo = create_demo()
if __name__ == "__main__":
demo.launch(share=True)