import os
import tempfile
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.figure import Figure
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

from prompts import VALIDATION_PROMPT


def load_data(file_path: Union[str, Path]) -> pd.DataFrame:
    """
    Load data from an Excel or CSV file

    Args:
        file_path (Union[str, Path]): Path to the file. The reader is chosen
            from the file extension, compared case-insensitively.

    Returns:
        pd.DataFrame: Loaded data

    Raises:
        ValueError: If the extension is not .xlsx, .xls or .csv
    """
    file_ext: str = os.path.splitext(file_path)[1].lower()

    if file_ext in (".xlsx", ".xls"):
        return pd.read_excel(file_path)
    if file_ext == ".csv":
        return pd.read_csv(file_path)

    raise ValueError(
        f"Unsupported file format: {file_ext}. Please upload an Excel or CSV file."
    )


def _is_text_dtype(series: pd.Series) -> bool:
    """Return True if the column is stored as generic objects or as a string dtype."""
    # A plain `dtype == "object"` comparison misses pandas' dedicated string
    # dtypes, so both checks are needed to recognise every string column.
    dtype = series.dtype
    return bool(
        pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    )


def analyze_text_columns(df: pd.DataFrame) -> List[str]:
    """
    Analyze columns to suggest text columns based on content analysis

    A column is suggested when it is string-like and more than 30% of the
    non-null values among its first 100 rows contain a space. If no column
    passes that test, every string-like column is returned instead.

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        List[str]: List of suggested text columns
    """
    string_columns: List[str] = [col for col in df.columns if _is_text_dtype(df[col])]

    suggested_text_columns: List[str] = []
    for col in string_columns:
        # Only the head of the column is inspected to keep the check cheap.
        sample = df[col].head(100).dropna()
        if sample.empty:
            continue

        # Values containing spaces are treated as free text rather than
        # codes, numbers or dates.
        text_ratio = sum(" " in str(val) for val in sample) / len(sample)
        if text_ratio > 0.3:
            suggested_text_columns.append(col)

    # Fall back to all string-like columns when the heuristic finds nothing.
    return suggested_text_columns or string_columns


def get_sample_texts(
    df: pd.DataFrame, text_columns: List[str], sample_size: int = 5
) -> List[str]:
    """
    Get sample texts from specified columns

    Missing values are skipped and every sample is converted to ``str`` so the
    result really is a list of strings (no ``nan`` floats or other raw cell
    values).

    Args:
        df (pd.DataFrame): Input dataframe
        text_columns (List[str]): List of text column names
        sample_size (int): Maximum number of samples to take from each column

    Returns:
        List[str]: List of sample texts, grouped column by column
    """
    sample_texts: List[str] = []
    for col in text_columns:
        # Drop missing cells first so they do not use up the sample budget.
        values = df[col].dropna().head(sample_size)
        sample_texts.extend(str(value) for value in values)
    return sample_texts


def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
    """
    Export dataframe to file

    The file is written inside an ``exports`` directory (relative to the
    current working directory), which is created if it does not exist.

    Args:
        df (pd.DataFrame): Dataframe to export
        file_name (str): Name of the output file
        format_type (str): "excel" or "csv", matched case-insensitively.
            Any other value is written as CSV.

    Returns:
        str: Path to the exported file
    """
    # Create export directory if it doesn't exist
    export_dir: str = "exports"
    os.makedirs(export_dir, exist_ok=True)

    # Full path for the export file
    export_path: str = os.path.join(export_dir, file_name)

    # Normalise so values such as "Excel" or " excel " are not silently
    # exported as CSV.
    normalized_format: str = str(format_type).strip().lower()
    if normalized_format == "excel":
        df.to_excel(export_path, index=False)
    else:
        df.to_csv(export_path, index=False)

    return export_path


def visualize_results(
    df: pd.DataFrame, text_column: str, category_column: str = "Category"
) -> Figure:
    """
    Create visualization of classification results

    Draws a bar chart with one bar per category and the row count written
    above each bar. If ``category_column`` is missing, a placeholder figure
    with an explanatory message is returned instead.

    Args:
        df (pd.DataFrame): Dataframe with classification results
        text_column (str): Name of the column containing text data
            (currently unused; kept for interface compatibility)
        category_column (str): Name of the column containing categories

    Returns:
        matplotlib.figure.Figure: Visualization figure
    """
    fig: Figure
    ax: Any
    fig, ax = plt.subplots(figsize=(10, 6))

    # Without a category column there is nothing to count; show a message.
    if category_column not in df.columns:
        ax.text(
            0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
        )
        ax.set_title("No Classification Results Available")
        fig.tight_layout()
        return fig

    # Get categories and their counts
    category_counts: pd.Series = df[category_column].value_counts()

    # Categories are discrete labels: plot them as strings so numeric or
    # mixed-type labels are drawn as separate, evenly spaced bars.
    labels: List[str] = [str(label) for label in category_counts.index]
    bars: Any = ax.bar(labels, category_counts.values)

    # Add value labels on top of each bar
    for bar in bars:
        height: float = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{int(height)}",
            ha="center",
            va="bottom",
        )

    # Customize the plot
    ax.set_xlabel("Categories")
    ax.set_ylabel("Number of Texts")
    ax.set_title("Distribution of Classified Texts")

    # Rotate the labels on this axes explicitly instead of relying on
    # pyplot's global "current axes".
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    # Add grid
    ax.grid(True, linestyle="--", alpha=0.7)

    fig.tight_layout()
    return fig


def validate_results(
    df: pd.DataFrame,
    text_columns: List[str],
    client: Any,
    model: str = "gpt-3.5-turbo",
) -> str:
    """
    Use LLM to validate the classification results

    Up to five rows are sampled (with a fixed seed, so the same rows are
    picked for the same dataframe), formatted together with their "Category"
    and "Confidence" values, and sent to the model in a single request built
    from ``VALIDATION_PROMPT``.

    Args:
        df (pd.DataFrame): Dataframe with classification results; must contain
            "Category" and "Confidence" columns
        text_columns (list): List of column names containing text data
        client: LiteLLM client; must expose ``chat.completions.create``
        model (str): Name of the chat model to query

    Returns:
        str: Validation report, or a message starting with
            "Validation failed:" if anything goes wrong
    """
    # Nothing to validate: avoid spending an LLM call on an empty prompt.
    if len(df) == 0:
        return "Validation failed: no classification results to validate"

    # This function reports problems as text instead of raising, so the
    # whole operation is wrapped in one best-effort handler.
    try:
        # Sample a few rows for validation
        sample_size: int = min(5, len(df))
        sample_df: pd.DataFrame = df.sample(n=sample_size, random_state=42)

        # Build validation prompts
        validation_prompts: List[str] = []
        for _, row in sample_df.iterrows():
            # Combine text from all selected columns
            text: str = " ".join(str(row[col]) for col in text_columns)
            assigned_category: str = row["Category"]
            confidence: float = row["Confidence"]

            validation_prompts.append(
                f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
            )

        # Use the prompt from prompts.py
        prompt: str = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))

        # Call LLM API
        response: Any = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=400,
        )

        content: Optional[str] = response.choices[0].message.content
        if content is None:
            # Report a missing completion explicitly rather than through an
            # AttributeError message.
            return "Validation failed: empty response from model"

        return content.strip()
    except Exception as e:
        return f"Validation failed: {str(e)}"