"""Gradio app: multi-provider LLM suggestions plus quick CSV analysis.

The app lets a user upload a CSV file, shows a short summary and a histogram,
optionally asks an LLM provider for analysis suggestions, and can run a small
"extended analysis" (correlation heatmap, bar plot, logistic regression).
"""

import importlib
import os
import subprocess
import sys

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import requests
import seaborn as sns
from huggingface_hub import InferenceClient
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seconds to wait for a remote HTTP API before giving up, so that a stalled
# endpoint cannot block a Gradio worker forever.
REQUEST_TIMEOUT_SECONDS = 30

# "text-davinci-003" has been retired by OpenAI; this is its completion-style
# successor and works with the same prompt/max_tokens request shape.
OPENAI_COMPLETION_MODEL = "gpt-3.5-turbo-instruct"


# --------------------------------------------------------------------------------
# OPTIONAL: dynamic installation for rarely used packages not in requirements.txt
# --------------------------------------------------------------------------------
def install_library(library):
    """Install a library with pip, using the running interpreter.

    Useful for rarely used packages NOT in requirements.txt.

    Args:
        library: The pip requirement to install (passed as a single argument,
            never through a shell).

    Returns:
        A human-readable status message; installation errors are reported in
        the message rather than raised.
    """
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", library])
        return f"Successfully installed {library}."
    except Exception as e:
        return f"Error installing {library}: {str(e)}"


def dynamic_import(library, alias=None):
    """Import a library at runtime, installing it first if it is missing.

    The imported module is bound in this module's globals under ``alias`` when
    given, otherwise under ``library``. The alias is honoured on both the
    direct-import path and the install-then-import path.

    Args:
        library: Importable module name.
        alias: Optional global name to bind the module to.

    Returns:
        The imported module.

    Raises:
        ImportError: If the module still cannot be imported after the
            installation attempt.
    """
    bound_name = alias or library
    try:
        module = importlib.import_module(library)
    except ImportError:
        print(install_library(library))
        # Freshly installed packages may not be visible to the import system
        # until its finder caches are cleared.
        importlib.invalidate_caches()
        module = importlib.import_module(library)
    globals()[bound_name] = module
    return module


# --------------------------------------------------------------------------------
# LLM CALLS: GPT-4o-mini, OpenAI, DeepSeek, Gemini
# --------------------------------------------------------------------------------
def call_gpt4o_mini(api_key, user_prompt):
    """Call a GPT-4o-mini model hosted on Hugging Face.

    Replace 'someUser/gpt-4o-mini' with your actual model repo.

    Args:
        api_key: Hugging Face API token.
        user_prompt: Prompt text sent to the text-generation endpoint.

    Returns:
        The generated text, or an error/explanation string on failure.
    """
    if not api_key:
        return "No Hugging Face API key provided. Cannot call GPT-4o-mini."

    try:
        # InferenceClient takes the repo id through ``model``; it has no
        # ``repo_id`` keyword.
        client = InferenceClient(
            model="someUser/gpt-4o-mini",  # <--- Replace with your real GPT-4o-mini repo
            token=api_key,
        )
        # We use text_generation endpoint; adapt if your model differs.
        # With the default arguments the result is returned as a string.
        return client.text_generation(user_prompt, max_new_tokens=128)
    except Exception as e:
        return f"Error calling GPT-4o-mini: {str(e)}"


def call_openai(api_key, user_prompt):
    """Call OpenAI's completions API.

    Works with both the current ``openai`` client (>= 1.0, detected through
    ``openai.OpenAI``) and the legacy module-level interface.

    Args:
        api_key: OpenAI API key.
        user_prompt: Prompt text.

    Returns:
        The stripped completion text, or an "OpenAI Error: ..." string.
    """
    try:
        if hasattr(openai, "OpenAI"):
            # openai >= 1.0: per-call client, no global state is touched.
            client = openai.OpenAI(api_key=api_key)
            response = client.completions.create(
                model=OPENAI_COMPLETION_MODEL,
                prompt=user_prompt,
                max_tokens=128,
            )
            return response.choices[0].text.strip()

        # Legacy openai < 1.0 interface.
        openai.api_key = api_key
        response = openai.Completion.create(
            model=OPENAI_COMPLETION_MODEL,
            prompt=user_prompt,
            max_tokens=128,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"OpenAI Error: {str(e)}"


def call_deepseek(api_key, user_prompt):
    """Call a (hypothetical) DeepSeek API endpoint.

    Replace the URL, payload, and response parsing with real DeepSeek logic
    as needed.

    Args:
        api_key: Bearer token sent in the Authorization header.
        user_prompt: Prompt text.

    Returns:
        The stripped response text, or a "DeepSeek Error: ..." string.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "prompt": user_prompt,
        "max_tokens": 128,
    }
    try:
        # Example POST; adapt to the real DeepSeek endpoint.
        response = requests.post(
            "https://api.deepseek.ai/v1/chat",
            json=payload,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["text"].strip()
    except Exception as e:
        return f"DeepSeek Error: {str(e)}"


def call_gemini(api_key, user_prompt):
    """Placeholder for a Gemini LLM call; replace with real Gemini logic.

    Both arguments are currently ignored.
    """
    return "(Gemini usage not yet implemented; placeholder)"


def call_llm(api_provider, api_key, user_prompt):
    """Route a prompt to the selected LLM provider.

    Args:
        api_provider: Provider name, matched case-insensitively against
            "GPT-4o-mini", "OpenAI", "DeepSeek", and "Gemini".
        api_key: API key for that provider.
        user_prompt: Prompt text.

    Returns:
        The provider's response text, or an explanatory message when the key
        is missing or the provider is unknown.
    """
    provider_lower = api_provider.lower()

    if not api_key:
        if provider_lower == "gpt-4o-mini":
            return "No API key provided. Using GPT-4o-mini default is not possible without HF key."
        return "No API key provided."

    providers = {
        "gpt-4o-mini": call_gpt4o_mini,
        "openai": call_openai,
        "deepseek": call_deepseek,
        "gemini": call_gemini,
    }
    handler = providers.get(provider_lower)
    if handler is None:
        return f"Unknown provider: {api_provider}. Please choose GPT-4o-mini, OpenAI, DeepSeek, or Gemini."
    return handler(api_key, user_prompt)


# --------------------------------------------------------------------------------
# ADVANCED DATA ANALYSIS (extended_analysis)
# --------------------------------------------------------------------------------
def extended_analysis(df):
    """Run the sample advanced analysis on a DataFrame.

    1. Correlation heatmap for numeric columns (needs at least two).
    2. Bar plot of 'Career' counts (if the column is present).
    3. Simple logistic regression predicting 'Career' from the numeric
       columns, if 'Career' has more than one class.

    The input DataFrame is not modified.

    Args:
        df: The loaded data.

    Returns:
        A tuple ``(output_paths, accuracy_info)`` where ``output_paths`` is
        the list of image files written and ``accuracy_info`` is a message
        describing the classification outcome.
    """
    # NOTE: plots are written to fixed file names in the working directory,
    # so concurrent requests overwrite each other's images.
    output_paths = []
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    has_career = "Career" in df.columns

    # 1) Correlation Heatmap
    if len(numeric_cols) > 1:
        corr = df[numeric_cols].corr()
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
        plt.title("Correlation Heatmap")
        heatmap_path = "heatmap.png"
        plt.savefig(heatmap_path)
        plt.close()
        output_paths.append(heatmap_path)

    # 2) Bar Plot of 'Career' if present
    if has_career:
        plt.figure(figsize=(8, 5))
        df["Career"].value_counts().plot(kind="bar")
        plt.title("Count of Each Career")
        plt.xlabel("Career")
        plt.ylabel("Count")
        barplot_path = "barplot_career.png"
        plt.savefig(barplot_path)
        plt.close()
        output_paths.append(barplot_path)

    # 3) Simple Logistic Regression if 'Career' exists with multiple categories.
    # The target itself must never be used as a feature (it would be when
    # 'Career' is stored as a number).
    feature_cols = [col for col in numeric_cols if col != "Career"]
    if not (has_career and feature_cols):
        return output_paths, "No 'Career' column or insufficient numeric data for classification."

    # Rows without a label cannot be used for training; casting to str keeps
    # LabelEncoder working on mixed-type label columns.
    labelled = df[df["Career"].notna()]
    y = LabelEncoder().fit_transform(labelled["Career"].astype(str))
    X = labelled[feature_cols].fillna(0)

    if len(np.unique(y)) > 1:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        accuracy_info = f"Logistic Regression accuracy on test set: {score:.2f}"
    else:
        accuracy_info = "Career column has only one class; no classification performed."

    return output_paths, accuracy_info


# --------------------------------------------------------------------------------
# MAIN ANALYSIS AND VISUALIZATION FUNCTION
# --------------------------------------------------------------------------------
def analyze_and_visualize(
    file,
    message,
    history,
    api_provider,
    api_key
):
    """Summarize an uploaded CSV and append the results to the chat history.

    Loads the CSV, builds a summary, calls the LLM for suggestions if an API
    key is provided, draws a histogram of the first numeric column, and runs
    the extended analysis if the message contains a trigger phrase
    ("sample analysis", "extended analysis", etc.).

    Args:
        file: The uploaded file, either a path string or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.
        message: The user's chat message.
        history: Existing chat history as a list of (user, bot) tuples.
        api_provider: LLM provider name passed to ``call_llm``.
        api_key: API key for the provider; empty disables LLM suggestions.

    Returns:
        The chat history extended with the new text response and any plots.
        Failures are reported as an "Error: ..." chat message, not raised.
    """
    history = history or []
    message = message or ""

    if file is None:
        return history + [(message, "Error: please upload a CSV file first.")]

    try:
        # Gradio hands over either a plain path or a tempfile-like wrapper,
        # depending on version and component settings.
        csv_path = getattr(file, "name", file)

        # Load CSV
        df = pd.read_csv(csv_path)
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

        # Basic info
        summary = (
            f"**File**: {csv_path}\n"
            f"**Shape**: {df.shape[0]} rows, {df.shape[1]} columns\n"
            f"**Numerical Columns**: {', '.join(numeric_cols) if numeric_cols else 'None'}\n"
            f"**Categorical Columns**: {', '.join(categorical_cols) if categorical_cols else 'None'}\n"
        )

        # LLM suggestions
        if api_key:
            user_prompt = (
                f"Data Summary:\n{summary}\n\n"
                f"User question or request: {message}\n"
                f"Suggest advanced data analysis or steps if relevant."
            )
            llm_response = call_llm(api_provider, api_key, user_prompt)
            llm_suggestions = f"\n**LLM Suggestions**:\n{llm_response}\n"
        else:
            llm_suggestions = "\n(No LLM suggestions because no API key provided.)\n"

        # Always produce example histogram if there's at least one numeric column
        hist_path = None
        if numeric_cols:
            first_numeric = numeric_cols[0]
            plt.figure(figsize=(6, 4))
            sns.histplot(df[first_numeric], kde=True)
            plt.title(f"Distribution of '{first_numeric}'")
            plt.tight_layout()
            hist_path = "temp_plot.png"
            plt.savefig(hist_path)
            plt.close()

        # Check if the user wants extended analysis
        trigger_phrases = ["sample analysis", "extended analysis", "advanced analysis", "run analysis"]
        analysis_paths = []
        accuracy_info = ""
        lowered_message = message.lower()
        if any(phrase in lowered_message for phrase in trigger_phrases):
            analysis_paths, accuracy_info = extended_analysis(df)

        # Build final response text
        response_text = summary + llm_suggestions
        if accuracy_info:
            response_text += f"\n**ML Model Info**: {accuracy_info}\n"

        # Construct the final chatbot content; images are sent as
        # (None, (path,)) entries so they appear as bot-only messages.
        chat_content = [(message, response_text)]
        if hist_path:
            chat_content.append((None, (hist_path,)))
        chat_content.extend((None, (path,)) for path in analysis_paths)

        return history + chat_content

    except Exception as e:
        # Top-level UI boundary: surface the problem in the chat instead of
        # crashing the request.
        return history + [(message, f"Error: {str(e)}")]


# --------------------------------------------------------------------------------
# CREATING THE GRADIO APP
# --------------------------------------------------------------------------------
def create_demo():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Returns:
        The configured ``gr.Blocks`` app with queuing enabled.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🤖 GPT-4o-mini (Default) + Multi-Provider AI Data Analysis Assistant")
        gr.Markdown(
            """
            **Features**:
            - Default LLM: GPT-4o-mini on Hugging Face (requires HF API key).
            - Other providers: **OpenAI**, **DeepSeek**, **Gemini** (enter their respective API keys).
            - Upload CSV for data summary & histograms.
            - Type "sample analysis" or "extended analysis" to trigger correlation heatmaps, bar plots, and a simple logistic regression.
            """
        )

        with gr.Row():
            api_provider = gr.Dropdown(
                choices=["GPT-4o-mini", "OpenAI", "DeepSeek", "Gemini"],
                value="GPT-4o-mini",  # default
                label="LLM Provider",
            )
            api_key = gr.Textbox(
                label="LLM API Key",
                placeholder="Enter your Hugging Face/DeepSeek/OpenAI/Gemini API key here..."
            )

        file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
        chatbot = gr.Chatbot(label="Analysis Output")
        msg = gr.Textbox(
            label="Message",
            placeholder="Ask the AI or type 'sample analysis' for extended analysis..."
        )
        send_btn = gr.Button("Send")
        reset_btn = gr.Button("Reset Chat")

        def respond(uploaded_file, user_message, chat_history, provider, key):
            """Shared handler for both the Enter key and the Send button."""
            return analyze_and_visualize(
                uploaded_file, user_message, chat_history or [], provider, key
            )

        def reset_chat():
            """Clear the chatbot history."""
            return []

        handler_inputs = [file_input, msg, chatbot, api_provider, api_key]

        # After each response, clear the message box.
        msg.submit(
            fn=respond,
            inputs=handler_inputs,
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])

        send_btn.click(
            fn=respond,
            inputs=handler_inputs,
            outputs=[chatbot]
        ).then(lambda: "", None, [msg])

        reset_btn.click(fn=reset_chat, inputs=[], outputs=[chatbot])

    demo.queue()
    return demo


demo = create_demo()

if __name__ == "__main__":
    demo.launch(share=True)