Spaces:

simondh
/

classifieur

Sleeping

App Files Community

simondh committed 11 days ago

Commit

535a3a5

1 Parent(s): 85fced4

add types

Browse files

Files changed (8) hide show

classifiers/base.py +4 -6
classifiers/llm.py +20 -19
classifiers/tfidf.py +29 -29
client.py +6 -5
process.py +46 -31
server.py +11 -9
test_server.py +8 -7
utils.py +26 -19

classifiers/base.py CHANGED Viewed

@@ -1,5 +1,3 @@
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -15,10 +13,10 @@ from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
 class BaseClassifier:
     """Base class for text classifiers"""
-    def __init__(self):
         pass
-    def classify(self, texts, categories=None):
         """
         Classify a list of texts into categories
@@ -31,7 +29,7 @@ class BaseClassifier:
         """
         raise NotImplementedError("Subclasses must implement this method")
-    def _generate_default_categories(self, texts, num_clusters=5):
         """
         Generate default categories based on text clustering
@@ -43,6 +41,6 @@ class BaseClassifier:
             list: List of category names
         """
         # Simple implementation - in real system this would be more sophisticated
-        default_categories = [f"Category {i+1}" for i in range(num_clusters)]
         return default_categories

 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 class BaseClassifier:
     """Base class for text classifiers"""
+    def __init__(self) -> None:
         pass
+    def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """
         Classify a list of texts into categories
         """
         raise NotImplementedError("Subclasses must implement this method")
+    def _generate_default_categories(self, texts: List[str], num_clusters: int = 5) -> List[str]:
         """
         Generate default categories based on text clustering
             list: List of category names
         """
         # Simple implementation - in real system this would be more sophisticated
+        default_categories: List[str] = [f"Category {i+1}" for i in range(num_clusters)]
         return default_categories

classifiers/llm.py CHANGED Viewed

@@ -6,9 +6,10 @@ from sklearn.metrics.pairwise import cosine_similarity
 import random
 import json
 import asyncio
-from typing import List, Dict, Any, Optional
 import sys
 import os
 # Add the project root to the Python path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -20,22 +21,22 @@ from .base import BaseClassifier
 class LLMClassifier(BaseClassifier):
     """Classifier using a Large Language Model for more accurate but slower classification"""
-    def __init__(self, client, model="gpt-3.5-turbo"):
         super().__init__()
-        self.client = client
-        self.model = model
     async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
         """Async version of text classification"""
-        prompt = TEXT_CLASSIFICATION_PROMPT.format(
             categories=", ".join(categories),
             text=text
         )
         try:
             # Use the synchronous client method but run it in a thread pool
-            loop = asyncio.get_event_loop()
-            response = await loop.run_in_executor(
                 None,
                 lambda: self.client.chat.completions.create(
                     model=self.model,
@@ -46,8 +47,8 @@ class LLMClassifier(BaseClassifier):
             )
             # Parse JSON response
-            response_text = response.choices[0].message.content.strip()
-            result = json.loads(response_text)
             # Ensure all required fields are present
             if not all(k in result for k in ["category", "confidence", "explanation"]):
@@ -68,7 +69,7 @@ class LLMClassifier(BaseClassifier):
             return result
         except json.JSONDecodeError:
             # Fall back to simple parsing if JSON fails
-            category = categories[0]  # Default
             for cat in categories:
                 if cat.lower() in response_text.lower():
                     category = cat
@@ -90,16 +91,16 @@ class LLMClassifier(BaseClassifier):
         """Async version of category suggestion"""
         # Take a sample of texts to avoid token limitations
         if len(texts) > sample_size:
-            sample_texts = random.sample(texts, sample_size)
         else:
-            sample_texts = texts
-        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
         try:
             # Use the synchronous client method but run it in a thread pool
-            loop = asyncio.get_event_loop()
-            response = await loop.run_in_executor(
                 None,
                 lambda: self.client.chat.completions.create(
                     model=self.model,
@@ -110,8 +111,8 @@ class LLMClassifier(BaseClassifier):
             )
             # Parse response to get categories
-            categories_text = response.choices[0].message.content.strip()
-            categories = [cat.strip() for cat in categories_text.split(",")]
             return categories
         except Exception as e:
@@ -127,10 +128,10 @@ class LLMClassifier(BaseClassifier):
             categories = await self._suggest_categories_async(texts)
         # Create tasks for all texts
-        tasks = [self._classify_text_async(text, categories) for text in texts]
         # Gather all results
-        results = await asyncio.gather(*tasks)
         return results
     def classify(

 import random
 import json
 import asyncio
+from typing import List, Dict, Any, Optional, Union
 import sys
 import os
+from litellm import OpenAI
 # Add the project root to the Python path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 class LLMClassifier(BaseClassifier):
     """Classifier using a Large Language Model for more accurate but slower classification"""
+    def __init__(self, client: OpenAI, model: str = "gpt-3.5-turbo") -> None:
         super().__init__()
+        self.client: OpenAI = client
+        self.model: str = model
     async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
         """Async version of text classification"""
+        prompt: str = TEXT_CLASSIFICATION_PROMPT.format(
             categories=", ".join(categories),
             text=text
         )
         try:
             # Use the synchronous client method but run it in a thread pool
+            loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
+            response: Any = await loop.run_in_executor(
                 None,
                 lambda: self.client.chat.completions.create(
                     model=self.model,
             )
             # Parse JSON response
+            response_text: str = response.choices[0].message.content.strip()
+            result: Dict[str, Any] = json.loads(response_text)
             # Ensure all required fields are present
             if not all(k in result for k in ["category", "confidence", "explanation"]):
             return result
         except json.JSONDecodeError:
             # Fall back to simple parsing if JSON fails
+            category: str = categories[0]  # Default
             for cat in categories:
                 if cat.lower() in response_text.lower():
                     category = cat
         """Async version of category suggestion"""
         # Take a sample of texts to avoid token limitations
         if len(texts) > sample_size:
+            sample_texts: List[str] = random.sample(texts, sample_size)
         else:
+            sample_texts: List[str] = texts
+        prompt: str = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
         try:
             # Use the synchronous client method but run it in a thread pool
+            loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
+            response: Any = await loop.run_in_executor(
                 None,
                 lambda: self.client.chat.completions.create(
                     model=self.model,
             )
             # Parse response to get categories
+            categories_text: str = response.choices[0].message.content.strip()
+            categories: List[str] = [cat.strip() for cat in categories_text.split(",")]
             return categories
         except Exception as e:
             categories = await self._suggest_categories_async(texts)
         # Create tasks for all texts
+        tasks: List[asyncio.Task] = [self._classify_text_async(text, categories) for text in texts]
         # Gather all results
+        results: List[Dict[str, Any]] = await asyncio.gather(*tasks)
         return results
     def classify(

classifiers/tfidf.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -9,6 +8,7 @@ import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional
 from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
 from .base import BaseClassifier
@@ -16,25 +16,25 @@ from .base import BaseClassifier
 class TFIDFClassifier(BaseClassifier):
     """Classifier using TF-IDF and clustering for fast classification"""
-    def __init__(self):
         super().__init__()
-        self.vectorizer = TfidfVectorizer(
             max_features=1000, stop_words="english", ngram_range=(1, 2)
         )
-        self.model = None
-        self.feature_names = None
-        self.categories = None
-        self.centroids = None
-    def classify(self, texts, categories=None):
         """Classify texts using TF-IDF and clustering"""
         # Vectorize the texts
-        X = self.vectorizer.fit_transform(texts)
         self.feature_names = self.vectorizer.get_feature_names_out()
         # Auto-detect categories if not provided
         if not categories:
-            num_clusters = min(5, len(texts))  # Don't create more clusters than texts
             self.categories = self._generate_default_categories(texts, num_clusters)
         else:
             self.categories = categories
@@ -42,22 +42,22 @@ class TFIDFClassifier(BaseClassifier):
         # Cluster the texts
         self.model = KMeans(n_clusters=num_clusters, random_state=42)
-        clusters = self.model.fit_predict(X)
         self.centroids = self.model.cluster_centers_
         # Calculate distances to centroids for confidence
-        distances = self._calculate_distances(X)
         # Prepare results
-        results = []
         for i, text in enumerate(texts):
-            cluster_idx = clusters[i]
             # Calculate confidence (inverse of distance, normalized)
-            confidence = self._calculate_confidence(distances[i])
             # Create explanation
-            explanation = self._generate_explanation(X[i], cluster_idx)
             results.append(
                 {
@@ -69,7 +69,7 @@ class TFIDFClassifier(BaseClassifier):
         return results
-    def _calculate_distances(self, X):
         """Calculate distances from each point to each centroid"""
         return np.sqrt(
             (
@@ -77,37 +77,37 @@ class TFIDFClassifier(BaseClassifier):
             ).sum(axis=2)
         )
-    def _calculate_confidence(self, distances):
         """Convert distances to confidence scores (0-100)"""
-        min_dist = np.min(distances)
-        max_dist = np.max(distances)
         # Normalize and invert (smaller distance = higher confidence)
         if max_dist == min_dist:
             return 70  # Default mid-range confidence when all distances are equal
-        normalized_dist = (distances - min_dist) / (max_dist - min_dist)
-        min_normalized = np.min(normalized_dist)
         # Invert and scale to 50-100 range (TF-IDF is never 100% confident)
-        confidence = 100 - (min_normalized * 50)
         return round(confidence, 1)
-    def _generate_explanation(self, text_vector, cluster_idx):
         """Generate an explanation for the classification"""
         # Get the most important features for this cluster
-        centroid = self.centroids[cluster_idx]
         # Get indices of top features for this text
-        text_array = text_vector.toarray()[0]
-        top_indices = text_array.argsort()[-5:][::-1]
         # Get the feature names for these indices
-        top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
         if not top_features:
             return "No significant features identified for this classification."
-        explanation = f"Classification based on key terms: {', '.join(top_features)}"
         return explanation

 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional
 from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
+from scipy.sparse import csr_matrix
 from .base import BaseClassifier
 class TFIDFClassifier(BaseClassifier):
     """Classifier using TF-IDF and clustering for fast classification"""
+    def __init__(self) -> None:
         super().__init__()
+        self.vectorizer: TfidfVectorizer = TfidfVectorizer(
             max_features=1000, stop_words="english", ngram_range=(1, 2)
         )
+        self.model: Optional[KMeans] = None
+        self.feature_names: Optional[np.ndarray] = None
+        self.categories: Optional[List[str]] = None
+        self.centroids: Optional[np.ndarray] = None
+    def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """Classify texts using TF-IDF and clustering"""
         # Vectorize the texts
+        X: csr_matrix = self.vectorizer.fit_transform(texts)
         self.feature_names = self.vectorizer.get_feature_names_out()
         # Auto-detect categories if not provided
         if not categories:
+            num_clusters: int = min(5, len(texts))  # Don't create more clusters than texts
             self.categories = self._generate_default_categories(texts, num_clusters)
         else:
             self.categories = categories
         # Cluster the texts
         self.model = KMeans(n_clusters=num_clusters, random_state=42)
+        clusters: np.ndarray = self.model.fit_predict(X)
         self.centroids = self.model.cluster_centers_
         # Calculate distances to centroids for confidence
+        distances: np.ndarray = self._calculate_distances(X)
         # Prepare results
+        results: List[Dict[str, Any]] = []
         for i, text in enumerate(texts):
+            cluster_idx: int = clusters[i]
             # Calculate confidence (inverse of distance, normalized)
+            confidence: float = self._calculate_confidence(distances[i])
             # Create explanation
+            explanation: str = self._generate_explanation(X[i], cluster_idx)
             results.append(
                 {
         return results
+    def _calculate_distances(self, X: csr_matrix) -> np.ndarray:
         """Calculate distances from each point to each centroid"""
         return np.sqrt(
             (
             ).sum(axis=2)
         )
+    def _calculate_confidence(self, distances: np.ndarray) -> float:
         """Convert distances to confidence scores (0-100)"""
+        min_dist: float = np.min(distances)
+        max_dist: float = np.max(distances)
         # Normalize and invert (smaller distance = higher confidence)
         if max_dist == min_dist:
             return 70  # Default mid-range confidence when all distances are equal
+        normalized_dist: np.ndarray = (distances - min_dist) / (max_dist - min_dist)
+        min_normalized: float = np.min(normalized_dist)
         # Invert and scale to 50-100 range (TF-IDF is never 100% confident)
+        confidence: float = 100 - (min_normalized * 50)
         return round(confidence, 1)
+    def _generate_explanation(self, text_vector: csr_matrix, cluster_idx: int) -> str:
         """Generate an explanation for the classification"""
         # Get the most important features for this cluster
+        centroid: np.ndarray = self.centroids[cluster_idx]
         # Get indices of top features for this text
+        text_array: np.ndarray = text_vector.toarray()[0]
+        top_indices: np.ndarray = text_array.argsort()[-5:][::-1]
         # Get the feature names for these indices
+        top_features: List[str] = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
         if not top_features:
             return "No significant features identified for this classification."
+        explanation: str = f"Classification based on key terms: {', '.join(top_features)}"
         return explanation

client.py CHANGED Viewed

@@ -1,19 +1,20 @@
 from litellm import OpenAI
 import os
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 # Initialize client as None
-client = None
-def get_client():
     """Get the OpenAI client instance"""
     global client
     return client
-def initialize_client(api_key=None):
     """Initialize the OpenAI client with an API key"""
     global client
     import logging
@@ -28,7 +29,7 @@ def initialize_client(api_key=None):
     try:
         client = OpenAI(api_key=api_key)
         # Test the connection with a simple request
-        response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": "test"}],
             max_tokens=5,
@@ -37,6 +38,6 @@ def initialize_client(api_key=None):
         return True, "API Key updated and verified successfully"
     except Exception as e:
         client = None
-        error_message = f"Failed to initialize client: {str(e)}"
         logging.error(error_message)
         return False, error_message

 from litellm import OpenAI
 import os
 from dotenv import load_dotenv
+from typing import Optional, Tuple, Any
 # Load environment variables
 load_dotenv()
 # Initialize client as None
+client: Optional[OpenAI] = None
+def get_client() -> Optional[OpenAI]:
     """Get the OpenAI client instance"""
     global client
     return client
+def initialize_client(api_key: Optional[str] = None) -> Tuple[bool, str]:
     """Initialize the OpenAI client with an API key"""
     global client
     import logging
     try:
         client = OpenAI(api_key=api_key)
         # Test the connection with a simple request
+        response: Any = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": "test"}],
             max_tokens=5,
         return True, "API Key updated and verified successfully"
     except Exception as e:
         client = None
+        error_message: str = f"Failed to initialize client: {str(e)}"
         logging.error(error_message)
         return False, error_message

process.py CHANGED Viewed

@@ -3,36 +3,45 @@ import time
 import traceback
 import asyncio
 from sklearn.feature_extraction.text import TfidfVectorizer
 from classifiers import TFIDFClassifier, LLMClassifier
 from utils import load_data, validate_results
 from client import get_client
-def update_api_key(api_key):
     """Update the OpenAI API key"""
     from client import initialize_client
     return initialize_client(api_key)
-async def process_file_async(file, text_columns, categories, classifier_type, show_explanations):
     """Async version of process_file"""
     # Initialize result_df and validation_report
-    result_df = None
-    validation_report = None
     try:
         # Load data from file
         if isinstance(file, str):
-            df = load_data(file)
         else:
-            df = load_data(file.name)
         if not text_columns:
             return None, "Please select at least one text column"
         # Check if all selected columns exist
-        missing_columns = [col for col in text_columns if col not in df.columns]
         if missing_columns:
             return (
                 None,
@@ -40,18 +49,18 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
             )
         # Combine text from selected columns
-        texts = []
         for _, row in df.iterrows():
-            combined_text = " ".join(str(row[col]) for col in text_columns)
             texts.append(combined_text)
         # Parse categories if provided
-        category_list = []
         if categories:
             category_list = [cat.strip() for cat in categories.split(",")]
         # Select classifier based on data size and user choice
-        num_texts = len(texts)
         # If no specific model is chosen, select the most appropriate one
         if classifier_type == "auto":
@@ -69,17 +78,17 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
         # Initialize appropriate classifier
         if classifier_type == "tfidf":
-            classifier = TFIDFClassifier()
-            results = classifier.classify(texts, category_list)
         elif classifier_type in ["gpt35", "gpt4"]:
             if client is None:
                 return (
                     None,
                     "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
                 )
-            model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
-            classifier = LLMClassifier(client=client, model=model)
-            results = await classifier.classify_async(texts, category_list)
         else:  # hybrid
             if client is None:
                 return (
@@ -87,14 +96,14 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
                     "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
                 )
             # First pass with TF-IDF
-            tfidf_classifier = TFIDFClassifier()
-            tfidf_results = tfidf_classifier.classify(texts, category_list)
             # Second pass with LLM for low confidence results
-            llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
-            results = []
-            low_confidence_texts = []
-            low_confidence_indices = []
             for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
                 if tfidf_result["confidence"] < 70:  # If confidence is below 70%
@@ -105,7 +114,7 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
                     results.append(tfidf_result)
             if low_confidence_texts:
-                llm_results = await llm_classifier.classify_async(
                     low_confidence_texts, category_list
                 )
                 for idx, llm_result in zip(low_confidence_indices, llm_results):
@@ -125,16 +134,22 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
         return result_df, validation_report
     except Exception as e:
-        error_traceback = traceback.format_exc()
         return None, f"Error: {str(e)}\n{error_traceback}"
-def process_file(file, text_columns, categories, classifier_type, show_explanations):
     """Synchronous wrapper for process_file_async"""
     return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
-def export_results(df, format_type):
     """Export results to a file and return the file path for download"""
     if df is None:
         return None
@@ -144,18 +159,18 @@ def export_results(df, format_type):
     import os
     # Create a temporary directory if it doesn't exist
-    temp_dir = "temp_exports"
     os.makedirs(temp_dir, exist_ok=True)
     # Generate a unique filename
-    timestamp = time.strftime("%Y%m%d-%H%M%S")
-    filename = f"classification_results_{timestamp}"
     if format_type == "excel":
-        file_path = os.path.join(temp_dir, f"{filename}.xlsx")
         df.to_excel(file_path, index=False)
     else:
-        file_path = os.path.join(temp_dir, f"{filename}.csv")
         df.to_csv(file_path, index=False)
     return file_path

 import traceback
 import asyncio
 from sklearn.feature_extraction.text import TfidfVectorizer
+from typing import Optional, List, Dict, Any, Tuple, Union
+import pandas as pd
+from pathlib import Path
 from classifiers import TFIDFClassifier, LLMClassifier
 from utils import load_data, validate_results
 from client import get_client
+def update_api_key(api_key: str) -> Tuple[bool, str]:
     """Update the OpenAI API key"""
     from client import initialize_client
     return initialize_client(api_key)
+async def process_file_async(
+    file: Union[str, Path],
+    text_columns: List[str],
+    categories: Optional[str],
+    classifier_type: str,
+    show_explanations: bool
+) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
     """Async version of process_file"""
     # Initialize result_df and validation_report
+    result_df: Optional[pd.DataFrame] = None
+    validation_report: Optional[str] = None
     try:
         # Load data from file
         if isinstance(file, str):
+            df: pd.DataFrame = load_data(file)
         else:
+            df: pd.DataFrame = load_data(file.name)
         if not text_columns:
             return None, "Please select at least one text column"
         # Check if all selected columns exist
+        missing_columns: List[str] = [col for col in text_columns if col not in df.columns]
         if missing_columns:
             return (
                 None,
             )
         # Combine text from selected columns
+        texts: List[str] = []
         for _, row in df.iterrows():
+            combined_text: str = " ".join(str(row[col]) for col in text_columns)
             texts.append(combined_text)
         # Parse categories if provided
+        category_list: List[str] = []
         if categories:
             category_list = [cat.strip() for cat in categories.split(",")]
         # Select classifier based on data size and user choice
+        num_texts: int = len(texts)
         # If no specific model is chosen, select the most appropriate one
         if classifier_type == "auto":
         # Initialize appropriate classifier
         if classifier_type == "tfidf":
+            classifier: TFIDFClassifier = TFIDFClassifier()
+            results: List[Dict[str, Any]] = classifier.classify(texts, category_list)
         elif classifier_type in ["gpt35", "gpt4"]:
             if client is None:
                 return (
                     None,
                     "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
                 )
+            model: str = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
+            classifier: LLMClassifier = LLMClassifier(client=client, model=model)
+            results: List[Dict[str, Any]] = await classifier.classify_async(texts, category_list)
         else:  # hybrid
             if client is None:
                 return (
                     "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
                 )
             # First pass with TF-IDF
+            tfidf_classifier: TFIDFClassifier = TFIDFClassifier()
+            tfidf_results: List[Dict[str, Any]] = tfidf_classifier.classify(texts, category_list)
             # Second pass with LLM for low confidence results
+            llm_classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
+            results: List[Optional[Dict[str, Any]]] = []
+            low_confidence_texts: List[str] = []
+            low_confidence_indices: List[int] = []
             for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
                 if tfidf_result["confidence"] < 70:  # If confidence is below 70%
                     results.append(tfidf_result)
             if low_confidence_texts:
+                llm_results: List[Dict[str, Any]] = await llm_classifier.classify_async(
                     low_confidence_texts, category_list
                 )
                 for idx, llm_result in zip(low_confidence_indices, llm_results):
         return result_df, validation_report
     except Exception as e:
+        error_traceback: str = traceback.format_exc()
         return None, f"Error: {str(e)}\n{error_traceback}"
+def process_file(
+    file: Union[str, Path],
+    text_columns: List[str],
+    categories: Optional[str],
+    classifier_type: str,
+    show_explanations: bool
+) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
     """Synchronous wrapper for process_file_async"""
     return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
+def export_results(df: pd.DataFrame, format_type: str) -> Optional[str]:
     """Export results to a file and return the file path for download"""
     if df is None:
         return None
     import os
     # Create a temporary directory if it doesn't exist
+    temp_dir: str = "temp_exports"
     os.makedirs(temp_dir, exist_ok=True)
     # Generate a unique filename
+    timestamp: str = time.strftime("%Y%m%d-%H%M%S")
+    filename: str = f"classification_results_{timestamp}"
     if format_type == "excel":
+        file_path: str = os.path.join(temp_dir, f"{filename}.xlsx")
         df.to_excel(file_path, index=False)
     else:
+        file_path: str = os.path.join(temp_dir, f"{filename}.csv")
         df.to_csv(file_path, index=False)
     return file_path

server.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from typing import List, Optional
 import json
 from classifiers.llm import LLMClassifier
 from litellm import completion
@@ -13,7 +13,7 @@ from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
-app = FastAPI()
 # Configure CORS
 app.add_middleware(
@@ -25,8 +25,10 @@ app.add_middleware(
 )
 # Initialize client with API key from environment
-api_key = os.environ.get("OPENAI_API_KEY")
 if api_key:
     success, message = initialize_client(api_key)
     if not success:
         raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
@@ -36,7 +38,7 @@ if not client:
     raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
 # Initialize the LLM classifier
-classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
 class TextInput(BaseModel):
     text: str
@@ -51,14 +53,14 @@ class CategorySuggestionResponse(BaseModel):
     categories: List[str]
 @app.post("/classify", response_model=ClassificationResponse)
-async def classify_text(text_input: TextInput):
     try:
         # Use async classification
-        results = await classifier.classify_async(
             [text_input.text],
             text_input.categories
         )
-        result = results[0]  # Get first result since we're classifying one text
         return ClassificationResponse(
             category=result["category"],
@@ -69,9 +71,9 @@ async def classify_text(text_input: TextInput):
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/suggest-categories", response_model=CategorySuggestionResponse)
-async def suggest_categories(texts: List[str]):
     try:
-        categories = await classifier._suggest_categories_async(texts)
         return CategorySuggestionResponse(categories=categories)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Tuple
 import json
 from classifiers.llm import LLMClassifier
 from litellm import completion
 # Load environment variables
 load_dotenv()
+app: FastAPI = FastAPI()
 # Configure CORS
 app.add_middleware(
 )
 # Initialize client with API key from environment
+api_key: Optional[str] = os.environ.get("OPENAI_API_KEY")
 if api_key:
+    success: bool
+    message: str
     success, message = initialize_client(api_key)
     if not success:
         raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
     raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
 # Initialize the LLM classifier
+classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
 class TextInput(BaseModel):
     text: str
     categories: List[str]
 @app.post("/classify", response_model=ClassificationResponse)
+async def classify_text(text_input: TextInput) -> ClassificationResponse:
     try:
         # Use async classification
+        results: List[Dict[str, Any]] = await classifier.classify_async(
             [text_input.text],
             text_input.categories
         )
+        result: Dict[str, Any] = results[0]  # Get first result since we're classifying one text
         return ClassificationResponse(
             category=result["category"],
         raise HTTPException(status_code=500, detail=str(e))
 # POST /suggest-categories: receives `texts` and returns the category names
 # proposed for them by the module-level LLM classifier, wrapped in a
 # CategorySuggestionResponse.
 @app.post("/suggest-categories", response_model=CategorySuggestionResponse)
+async def suggest_categories(texts: List[str]) -> CategorySuggestionResponse:
     try:
         # NOTE(review): this calls an underscore-prefixed (non-public by
         # convention) method on the classifier; confirm that is intended.
+        categories: List[str] = await classifier._suggest_categories_async(texts)
         return CategorySuggestionResponse(categories=categories)
     except Exception as e:
         # Any failure is reported to the caller as HTTP 500, with the
         # exception message used as the response detail.
         raise HTTPException(status_code=500, detail=str(e))

test_server.py CHANGED Viewed

@@ -1,13 +1,14 @@
 import requests
 import json
-BASE_URL = "http://localhost:8000"
-def test_classify_text():
     # Load emails from CSV file
     import csv
-    emails = []
     with open("examples/emails.csv", "r", encoding="utf-8") as file:
         reader = csv.DictReader(file)
         for row in reader:
@@ -15,7 +16,7 @@ def test_classify_text():
     # Test with default categories using email content
     for email in emails[:5]:
-        response = requests.post(
             f"{BASE_URL}/classify",
             json={"text": email["contenu"]}
         )
@@ -23,11 +24,11 @@ def test_classify_text():
         print(json.dumps(response.json(), indent=2))
-def test_suggest_categories():
     # Load reviews from CSV file
     import csv
-    texts = []
     with open("examples/reviews.csv", "r", encoding="utf-8") as file:
         reader = csv.DictReader(file)
         for row in reader:
@@ -35,7 +36,7 @@ def test_suggest_categories():
     # Use the first few reviews for testing
     texts = texts[:5]
-    response = requests.post(
         f"{BASE_URL}/suggest-categories",
         json=texts
     )

 import requests
 import json
+from typing import List, Dict, Any, Optional
+BASE_URL: str = "http://localhost:8000"
+def test_classify_text() -> None:
     # Load emails from CSV file
     import csv
+    emails: List[Dict[str, str]] = []
     with open("examples/emails.csv", "r", encoding="utf-8") as file:
         reader = csv.DictReader(file)
         for row in reader:
     # Test with default categories using email content
     for email in emails[:5]:
+        response: requests.Response = requests.post(
             f"{BASE_URL}/classify",
             json={"text": email["contenu"]}
         )
         print(json.dumps(response.json(), indent=2))
+def test_suggest_categories() -> None:
     # Load reviews from CSV file
     import csv
+    texts: List[str] = []
     with open("examples/reviews.csv", "r", encoding="utf-8") as file:
         reader = csv.DictReader(file)
         for row in reader:
     # Use the first few reviews for testing
     texts = texts[:5]
+    response: requests.Response = requests.post(
         f"{BASE_URL}/suggest-categories",
         json=texts
     )

utils.py CHANGED Viewed

@@ -6,9 +6,12 @@ from sklearn.decomposition import PCA
 from sklearn.feature_extraction.text import TfidfVectorizer
 import tempfile
 from prompts import VALIDATION_PROMPT
-def load_data(file_path):
     """
     Load data from an Excel or CSV file
@@ -18,7 +21,7 @@ def load_data(file_path):
     Returns:
         pd.DataFrame: Loaded data
     """
-    file_ext = os.path.splitext(file_path)[1].lower()
     if file_ext == ".xlsx" or file_ext == ".xls":
         return pd.read_excel(file_path)
@@ -30,7 +33,7 @@ def load_data(file_path):
         )
-def export_data(df, file_name, format_type="excel"):
     """
     Export dataframe to file
@@ -43,11 +46,11 @@ def export_data(df, file_name, format_type="excel"):
         str: Path to the exported file
     """
     # Create export directory if it doesn't exist
-    export_dir = "exports"
     os.makedirs(export_dir, exist_ok=True)
     # Full path for the export file
-    export_path = os.path.join(export_dir, file_name)
     # Export based on format type
     if format_type == "excel":
@@ -58,7 +61,7 @@ def export_data(df, file_name, format_type="excel"):
     return export_path
-def visualize_results(df, text_column, category_column="Category"):
     """
     Create visualization of classification results
@@ -73,6 +76,8 @@ def visualize_results(df, text_column, category_column="Category"):
     # Check if category column exists
     if category_column not in df.columns:
         # Create a simple figure with a message
         fig, ax = plt.subplots(figsize=(10, 6))
         ax.text(
             0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
@@ -82,17 +87,19 @@ def visualize_results(df, text_column, category_column="Category"):
         return fig
     # Get categories and their counts
-    category_counts = df[category_column].value_counts()
     # Create a new figure
     fig, ax = plt.subplots(figsize=(10, 6))
     # Create the histogram
-    bars = ax.bar(category_counts.index, category_counts.values)
     # Add value labels on top of each bar
     for bar in bars:
-        height = bar.get_height()
         ax.text(
             bar.get_x() + bar.get_width() / 2.0,
             height,
@@ -117,7 +124,7 @@ def visualize_results(df, text_column, category_column="Category"):
     return fig
-def validate_results(df, text_columns, client):
     """
     Use LLM to validate the classification results
@@ -131,33 +138,33 @@ def validate_results(df, text_columns, client):
     """
     try:
         # Sample a few rows for validation
-        sample_size = min(5, len(df))
-        sample_df = df.sample(n=sample_size, random_state=42)
         # Build validation prompts
-        validation_prompts = []
         for _, row in sample_df.iterrows():
             # Combine text from all selected columns
-            text = " ".join(str(row[col]) for col in text_columns)
-            assigned_category = row["Category"]
-            confidence = row["Confidence"]
             validation_prompts.append(
                 f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
             )
         # Use the prompt from prompts.py
-        prompt = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
         # Call LLM API
-        response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": prompt}],
             temperature=0.3,
             max_tokens=400,
         )
-        validation_report = response.choices[0].message.content.strip()
         return validation_report
     except Exception as e:

 from sklearn.feature_extraction.text import TfidfVectorizer
 import tempfile
 from prompts import VALIDATION_PROMPT
+from typing import List, Optional, Any, Union
+from pathlib import Path
+from matplotlib.figure import Figure
+def load_data(file_path: Union[str, Path]) -> pd.DataFrame:
     """
     Load data from an Excel or CSV file
     Returns:
         pd.DataFrame: Loaded data
     """
+    file_ext: str = os.path.splitext(file_path)[1].lower()
     if file_ext == ".xlsx" or file_ext == ".xls":
         return pd.read_excel(file_path)
         )
+def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
     """
     Export dataframe to file
         str: Path to the exported file
     """
     # Create export directory if it doesn't exist
+    export_dir: str = "exports"
     os.makedirs(export_dir, exist_ok=True)
     # Full path for the export file
+    export_path: str = os.path.join(export_dir, file_name)
     # Export based on format type
     if format_type == "excel":
     return export_path
+def visualize_results(df: pd.DataFrame, text_column: str, category_column: str = "Category") -> Figure:
     """
     Create visualization of classification results
     # Check if category column exists
     if category_column not in df.columns:
         # Create a simple figure with a message
+        fig: Figure
+        ax: Any
         fig, ax = plt.subplots(figsize=(10, 6))
         ax.text(
             0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
         return fig
     # Get categories and their counts
+    category_counts: pd.Series = df[category_column].value_counts()
     # Create a new figure
+    fig: Figure
+    ax: Any
     fig, ax = plt.subplots(figsize=(10, 6))
     # Create the histogram
+    bars: Any = ax.bar(category_counts.index, category_counts.values)
     # Add value labels on top of each bar
     for bar in bars:
+        height: float = bar.get_height()
         ax.text(
             bar.get_x() + bar.get_width() / 2.0,
             height,
     return fig
+def validate_results(df: pd.DataFrame, text_columns: List[str], client: Any) -> str:
     """
     Use LLM to validate the classification results
     """
     try:
         # Sample a few rows for validation
+        sample_size: int = min(5, len(df))
+        sample_df: pd.DataFrame = df.sample(n=sample_size, random_state=42)
         # Build validation prompts
+        validation_prompts: List[str] = []
         for _, row in sample_df.iterrows():
             # Combine text from all selected columns
+            text: str = " ".join(str(row[col]) for col in text_columns)
+            assigned_category: str = row["Category"]
+            confidence: float = row["Confidence"]
             validation_prompts.append(
                 f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
             )
         # Use the prompt from prompts.py
+        prompt: str = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
         # Call LLM API
+        response: Any = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": prompt}],
             temperature=0.3,
             max_tokens=400,
         )
+        validation_report: str = response.choices[0].message.content.strip()
         return validation_report
     except Exception as e: