simondh committed on
Commit
535a3a5
·
1 Parent(s): 85fced4
Files changed (8) hide show
  1. classifiers/base.py +4 -6
  2. classifiers/llm.py +20 -19
  3. classifiers/tfidf.py +29 -29
  4. client.py +6 -5
  5. process.py +46 -31
  6. server.py +11 -9
  7. test_server.py +8 -7
  8. utils.py +26 -19
classifiers/base.py CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
  import numpy as np
4
  import pandas as pd
5
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -15,10 +13,10 @@ from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
15
  class BaseClassifier:
16
  """Base class for text classifiers"""
17
 
18
- def __init__(self):
19
  pass
20
 
21
- def classify(self, texts, categories=None):
22
  """
23
  Classify a list of texts into categories
24
 
@@ -31,7 +29,7 @@ class BaseClassifier:
31
  """
32
  raise NotImplementedError("Subclasses must implement this method")
33
 
34
- def _generate_default_categories(self, texts, num_clusters=5):
35
  """
36
  Generate default categories based on text clustering
37
 
@@ -43,6 +41,6 @@ class BaseClassifier:
43
  list: List of category names
44
  """
45
  # Simple implementation - in real system this would be more sophisticated
46
- default_categories = [f"Category {i+1}" for i in range(num_clusters)]
47
  return default_categories
48
 
 
 
 
1
  import numpy as np
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
 
13
  class BaseClassifier:
14
  """Base class for text classifiers"""
15
 
16
+ def __init__(self) -> None:
17
  pass
18
 
19
+ def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
20
  """
21
  Classify a list of texts into categories
22
 
 
29
  """
30
  raise NotImplementedError("Subclasses must implement this method")
31
 
32
+ def _generate_default_categories(self, texts: List[str], num_clusters: int = 5) -> List[str]:
33
  """
34
  Generate default categories based on text clustering
35
 
 
41
  list: List of category names
42
  """
43
  # Simple implementation - in real system this would be more sophisticated
44
+ default_categories: List[str] = [f"Category {i+1}" for i in range(num_clusters)]
45
  return default_categories
46
 
classifiers/llm.py CHANGED
@@ -6,9 +6,10 @@ from sklearn.metrics.pairwise import cosine_similarity
6
  import random
7
  import json
8
  import asyncio
9
- from typing import List, Dict, Any, Optional
10
  import sys
11
  import os
 
12
 
13
  # Add the project root to the Python path
14
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -20,22 +21,22 @@ from .base import BaseClassifier
20
  class LLMClassifier(BaseClassifier):
21
  """Classifier using a Large Language Model for more accurate but slower classification"""
22
 
23
- def __init__(self, client, model="gpt-3.5-turbo"):
24
  super().__init__()
25
- self.client = client
26
- self.model = model
27
 
28
  async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
29
  """Async version of text classification"""
30
- prompt = TEXT_CLASSIFICATION_PROMPT.format(
31
  categories=", ".join(categories),
32
  text=text
33
  )
34
 
35
  try:
36
  # Use the synchronous client method but run it in a thread pool
37
- loop = asyncio.get_event_loop()
38
- response = await loop.run_in_executor(
39
  None,
40
  lambda: self.client.chat.completions.create(
41
  model=self.model,
@@ -46,8 +47,8 @@ class LLMClassifier(BaseClassifier):
46
  )
47
 
48
  # Parse JSON response
49
- response_text = response.choices[0].message.content.strip()
50
- result = json.loads(response_text)
51
 
52
  # Ensure all required fields are present
53
  if not all(k in result for k in ["category", "confidence", "explanation"]):
@@ -68,7 +69,7 @@ class LLMClassifier(BaseClassifier):
68
  return result
69
  except json.JSONDecodeError:
70
  # Fall back to simple parsing if JSON fails
71
- category = categories[0] # Default
72
  for cat in categories:
73
  if cat.lower() in response_text.lower():
74
  category = cat
@@ -90,16 +91,16 @@ class LLMClassifier(BaseClassifier):
90
  """Async version of category suggestion"""
91
  # Take a sample of texts to avoid token limitations
92
  if len(texts) > sample_size:
93
- sample_texts = random.sample(texts, sample_size)
94
  else:
95
- sample_texts = texts
96
 
97
- prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
98
 
99
  try:
100
  # Use the synchronous client method but run it in a thread pool
101
- loop = asyncio.get_event_loop()
102
- response = await loop.run_in_executor(
103
  None,
104
  lambda: self.client.chat.completions.create(
105
  model=self.model,
@@ -110,8 +111,8 @@ class LLMClassifier(BaseClassifier):
110
  )
111
 
112
  # Parse response to get categories
113
- categories_text = response.choices[0].message.content.strip()
114
- categories = [cat.strip() for cat in categories_text.split(",")]
115
 
116
  return categories
117
  except Exception as e:
@@ -127,10 +128,10 @@ class LLMClassifier(BaseClassifier):
127
  categories = await self._suggest_categories_async(texts)
128
 
129
  # Create tasks for all texts
130
- tasks = [self._classify_text_async(text, categories) for text in texts]
131
 
132
  # Gather all results
133
- results = await asyncio.gather(*tasks)
134
  return results
135
 
136
  def classify(
 
6
  import random
7
  import json
8
  import asyncio
9
+ from typing import List, Dict, Any, Optional, Union
10
  import sys
11
  import os
12
+ from litellm import OpenAI
13
 
14
  # Add the project root to the Python path
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
21
  class LLMClassifier(BaseClassifier):
22
  """Classifier using a Large Language Model for more accurate but slower classification"""
23
 
24
+ def __init__(self, client: OpenAI, model: str = "gpt-3.5-turbo") -> None:
25
  super().__init__()
26
+ self.client: OpenAI = client
27
+ self.model: str = model
28
 
29
  async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
30
  """Async version of text classification"""
31
+ prompt: str = TEXT_CLASSIFICATION_PROMPT.format(
32
  categories=", ".join(categories),
33
  text=text
34
  )
35
 
36
  try:
37
  # Use the synchronous client method but run it in a thread pool
38
+ loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
39
+ response: Any = await loop.run_in_executor(
40
  None,
41
  lambda: self.client.chat.completions.create(
42
  model=self.model,
 
47
  )
48
 
49
  # Parse JSON response
50
+ response_text: str = response.choices[0].message.content.strip()
51
+ result: Dict[str, Any] = json.loads(response_text)
52
 
53
  # Ensure all required fields are present
54
  if not all(k in result for k in ["category", "confidence", "explanation"]):
 
69
  return result
70
  except json.JSONDecodeError:
71
  # Fall back to simple parsing if JSON fails
72
+ category: str = categories[0] # Default
73
  for cat in categories:
74
  if cat.lower() in response_text.lower():
75
  category = cat
 
91
  """Async version of category suggestion"""
92
  # Take a sample of texts to avoid token limitations
93
  if len(texts) > sample_size:
94
+ sample_texts: List[str] = random.sample(texts, sample_size)
95
  else:
96
+ sample_texts: List[str] = texts
97
 
98
+ prompt: str = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
99
 
100
  try:
101
  # Use the synchronous client method but run it in a thread pool
102
+ loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
103
+ response: Any = await loop.run_in_executor(
104
  None,
105
  lambda: self.client.chat.completions.create(
106
  model=self.model,
 
111
  )
112
 
113
  # Parse response to get categories
114
+ categories_text: str = response.choices[0].message.content.strip()
115
+ categories: List[str] = [cat.strip() for cat in categories_text.split(",")]
116
 
117
  return categories
118
  except Exception as e:
 
128
  categories = await self._suggest_categories_async(texts)
129
 
130
  # Create tasks for all texts
131
+ tasks: List[asyncio.Task] = [self._classify_text_async(text, categories) for text in texts]
132
 
133
  # Gather all results
134
+ results: List[Dict[str, Any]] = await asyncio.gather(*tasks)
135
  return results
136
 
137
  def classify(
classifiers/tfidf.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -9,6 +8,7 @@ import json
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from typing import List, Dict, Any, Optional
11
  from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
 
12
 
13
  from .base import BaseClassifier
14
 
@@ -16,25 +16,25 @@ from .base import BaseClassifier
16
  class TFIDFClassifier(BaseClassifier):
17
  """Classifier using TF-IDF and clustering for fast classification"""
18
 
19
- def __init__(self):
20
  super().__init__()
21
- self.vectorizer = TfidfVectorizer(
22
  max_features=1000, stop_words="english", ngram_range=(1, 2)
23
  )
24
- self.model = None
25
- self.feature_names = None
26
- self.categories = None
27
- self.centroids = None
28
 
29
- def classify(self, texts, categories=None):
30
  """Classify texts using TF-IDF and clustering"""
31
  # Vectorize the texts
32
- X = self.vectorizer.fit_transform(texts)
33
  self.feature_names = self.vectorizer.get_feature_names_out()
34
 
35
  # Auto-detect categories if not provided
36
  if not categories:
37
- num_clusters = min(5, len(texts)) # Don't create more clusters than texts
38
  self.categories = self._generate_default_categories(texts, num_clusters)
39
  else:
40
  self.categories = categories
@@ -42,22 +42,22 @@ class TFIDFClassifier(BaseClassifier):
42
 
43
  # Cluster the texts
44
  self.model = KMeans(n_clusters=num_clusters, random_state=42)
45
- clusters = self.model.fit_predict(X)
46
  self.centroids = self.model.cluster_centers_
47
 
48
  # Calculate distances to centroids for confidence
49
- distances = self._calculate_distances(X)
50
 
51
  # Prepare results
52
- results = []
53
  for i, text in enumerate(texts):
54
- cluster_idx = clusters[i]
55
 
56
  # Calculate confidence (inverse of distance, normalized)
57
- confidence = self._calculate_confidence(distances[i])
58
 
59
  # Create explanation
60
- explanation = self._generate_explanation(X[i], cluster_idx)
61
 
62
  results.append(
63
  {
@@ -69,7 +69,7 @@ class TFIDFClassifier(BaseClassifier):
69
 
70
  return results
71
 
72
- def _calculate_distances(self, X):
73
  """Calculate distances from each point to each centroid"""
74
  return np.sqrt(
75
  (
@@ -77,37 +77,37 @@ class TFIDFClassifier(BaseClassifier):
77
  ).sum(axis=2)
78
  )
79
 
80
- def _calculate_confidence(self, distances):
81
  """Convert distances to confidence scores (0-100)"""
82
- min_dist = np.min(distances)
83
- max_dist = np.max(distances)
84
 
85
  # Normalize and invert (smaller distance = higher confidence)
86
  if max_dist == min_dist:
87
  return 70 # Default mid-range confidence when all distances are equal
88
 
89
- normalized_dist = (distances - min_dist) / (max_dist - min_dist)
90
- min_normalized = np.min(normalized_dist)
91
 
92
  # Invert and scale to 50-100 range (TF-IDF is never 100% confident)
93
- confidence = 100 - (min_normalized * 50)
94
  return round(confidence, 1)
95
 
96
- def _generate_explanation(self, text_vector, cluster_idx):
97
  """Generate an explanation for the classification"""
98
  # Get the most important features for this cluster
99
- centroid = self.centroids[cluster_idx]
100
 
101
  # Get indices of top features for this text
102
- text_array = text_vector.toarray()[0]
103
- top_indices = text_array.argsort()[-5:][::-1]
104
 
105
  # Get the feature names for these indices
106
- top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
107
 
108
  if not top_features:
109
  return "No significant features identified for this classification."
110
 
111
- explanation = f"Classification based on key terms: {', '.join(top_features)}"
112
  return explanation
113
 
 
 
1
  import numpy as np
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  from typing import List, Dict, Any, Optional
10
  from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
11
+ from scipy.sparse import csr_matrix
12
 
13
  from .base import BaseClassifier
14
 
 
16
  class TFIDFClassifier(BaseClassifier):
17
  """Classifier using TF-IDF and clustering for fast classification"""
18
 
19
+ def __init__(self) -> None:
20
  super().__init__()
21
+ self.vectorizer: TfidfVectorizer = TfidfVectorizer(
22
  max_features=1000, stop_words="english", ngram_range=(1, 2)
23
  )
24
+ self.model: Optional[KMeans] = None
25
+ self.feature_names: Optional[np.ndarray] = None
26
+ self.categories: Optional[List[str]] = None
27
+ self.centroids: Optional[np.ndarray] = None
28
 
29
+ def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
30
  """Classify texts using TF-IDF and clustering"""
31
  # Vectorize the texts
32
+ X: csr_matrix = self.vectorizer.fit_transform(texts)
33
  self.feature_names = self.vectorizer.get_feature_names_out()
34
 
35
  # Auto-detect categories if not provided
36
  if not categories:
37
+ num_clusters: int = min(5, len(texts)) # Don't create more clusters than texts
38
  self.categories = self._generate_default_categories(texts, num_clusters)
39
  else:
40
  self.categories = categories
 
42
 
43
  # Cluster the texts
44
  self.model = KMeans(n_clusters=num_clusters, random_state=42)
45
+ clusters: np.ndarray = self.model.fit_predict(X)
46
  self.centroids = self.model.cluster_centers_
47
 
48
  # Calculate distances to centroids for confidence
49
+ distances: np.ndarray = self._calculate_distances(X)
50
 
51
  # Prepare results
52
+ results: List[Dict[str, Any]] = []
53
  for i, text in enumerate(texts):
54
+ cluster_idx: int = clusters[i]
55
 
56
  # Calculate confidence (inverse of distance, normalized)
57
+ confidence: float = self._calculate_confidence(distances[i])
58
 
59
  # Create explanation
60
+ explanation: str = self._generate_explanation(X[i], cluster_idx)
61
 
62
  results.append(
63
  {
 
69
 
70
  return results
71
 
72
+ def _calculate_distances(self, X: csr_matrix) -> np.ndarray:
73
  """Calculate distances from each point to each centroid"""
74
  return np.sqrt(
75
  (
 
77
  ).sum(axis=2)
78
  )
79
 
80
+ def _calculate_confidence(self, distances: np.ndarray) -> float:
81
  """Convert distances to confidence scores (0-100)"""
82
+ min_dist: float = np.min(distances)
83
+ max_dist: float = np.max(distances)
84
 
85
  # Normalize and invert (smaller distance = higher confidence)
86
  if max_dist == min_dist:
87
  return 70 # Default mid-range confidence when all distances are equal
88
 
89
+ normalized_dist: np.ndarray = (distances - min_dist) / (max_dist - min_dist)
90
+ min_normalized: float = np.min(normalized_dist)
91
 
92
  # Invert and scale to 50-100 range (TF-IDF is never 100% confident)
93
+ confidence: float = 100 - (min_normalized * 50)
94
  return round(confidence, 1)
95
 
96
+ def _generate_explanation(self, text_vector: csr_matrix, cluster_idx: int) -> str:
97
  """Generate an explanation for the classification"""
98
  # Get the most important features for this cluster
99
+ centroid: np.ndarray = self.centroids[cluster_idx]
100
 
101
  # Get indices of top features for this text
102
+ text_array: np.ndarray = text_vector.toarray()[0]
103
+ top_indices: np.ndarray = text_array.argsort()[-5:][::-1]
104
 
105
  # Get the feature names for these indices
106
+ top_features: List[str] = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
107
 
108
  if not top_features:
109
  return "No significant features identified for this classification."
110
 
111
+ explanation: str = f"Classification based on key terms: {', '.join(top_features)}"
112
  return explanation
113
 
client.py CHANGED
@@ -1,19 +1,20 @@
1
  from litellm import OpenAI
2
  import os
3
  from dotenv import load_dotenv
 
4
 
5
  # Load environment variables
6
  load_dotenv()
7
 
8
  # Initialize client as None
9
- client = None
10
 
11
- def get_client():
12
  """Get the OpenAI client instance"""
13
  global client
14
  return client
15
 
16
- def initialize_client(api_key=None):
17
  """Initialize the OpenAI client with an API key"""
18
  global client
19
  import logging
@@ -28,7 +29,7 @@ def initialize_client(api_key=None):
28
  try:
29
  client = OpenAI(api_key=api_key)
30
  # Test the connection with a simple request
31
- response = client.chat.completions.create(
32
  model="gpt-3.5-turbo",
33
  messages=[{"role": "user", "content": "test"}],
34
  max_tokens=5,
@@ -37,6 +38,6 @@ def initialize_client(api_key=None):
37
  return True, "API Key updated and verified successfully"
38
  except Exception as e:
39
  client = None
40
- error_message = f"Failed to initialize client: {str(e)}"
41
  logging.error(error_message)
42
  return False, error_message
 
1
  from litellm import OpenAI
2
  import os
3
  from dotenv import load_dotenv
4
+ from typing import Optional, Tuple, Any
5
 
6
  # Load environment variables
7
  load_dotenv()
8
 
9
  # Initialize client as None
10
+ client: Optional[OpenAI] = None
11
 
12
+ def get_client() -> Optional[OpenAI]:
13
  """Get the OpenAI client instance"""
14
  global client
15
  return client
16
 
17
+ def initialize_client(api_key: Optional[str] = None) -> Tuple[bool, str]:
18
  """Initialize the OpenAI client with an API key"""
19
  global client
20
  import logging
 
29
  try:
30
  client = OpenAI(api_key=api_key)
31
  # Test the connection with a simple request
32
+ response: Any = client.chat.completions.create(
33
  model="gpt-3.5-turbo",
34
  messages=[{"role": "user", "content": "test"}],
35
  max_tokens=5,
 
38
  return True, "API Key updated and verified successfully"
39
  except Exception as e:
40
  client = None
41
+ error_message: str = f"Failed to initialize client: {str(e)}"
42
  logging.error(error_message)
43
  return False, error_message
process.py CHANGED
@@ -3,36 +3,45 @@ import time
3
  import traceback
4
  import asyncio
5
  from sklearn.feature_extraction.text import TfidfVectorizer
 
 
 
6
 
7
  from classifiers import TFIDFClassifier, LLMClassifier
8
  from utils import load_data, validate_results
9
  from client import get_client
10
 
11
 
12
- def update_api_key(api_key):
13
  """Update the OpenAI API key"""
14
  from client import initialize_client
15
  return initialize_client(api_key)
16
 
17
 
18
- async def process_file_async(file, text_columns, categories, classifier_type, show_explanations):
 
 
 
 
 
 
19
  """Async version of process_file"""
20
  # Initialize result_df and validation_report
21
- result_df = None
22
- validation_report = None
23
 
24
  try:
25
  # Load data from file
26
  if isinstance(file, str):
27
- df = load_data(file)
28
  else:
29
- df = load_data(file.name)
30
 
31
  if not text_columns:
32
  return None, "Please select at least one text column"
33
 
34
  # Check if all selected columns exist
35
- missing_columns = [col for col in text_columns if col not in df.columns]
36
  if missing_columns:
37
  return (
38
  None,
@@ -40,18 +49,18 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
40
  )
41
 
42
  # Combine text from selected columns
43
- texts = []
44
  for _, row in df.iterrows():
45
- combined_text = " ".join(str(row[col]) for col in text_columns)
46
  texts.append(combined_text)
47
 
48
  # Parse categories if provided
49
- category_list = []
50
  if categories:
51
  category_list = [cat.strip() for cat in categories.split(",")]
52
 
53
  # Select classifier based on data size and user choice
54
- num_texts = len(texts)
55
 
56
  # If no specific model is chosen, select the most appropriate one
57
  if classifier_type == "auto":
@@ -69,17 +78,17 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
69
 
70
  # Initialize appropriate classifier
71
  if classifier_type == "tfidf":
72
- classifier = TFIDFClassifier()
73
- results = classifier.classify(texts, category_list)
74
  elif classifier_type in ["gpt35", "gpt4"]:
75
  if client is None:
76
  return (
77
  None,
78
  "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
79
  )
80
- model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
81
- classifier = LLMClassifier(client=client, model=model)
82
- results = await classifier.classify_async(texts, category_list)
83
  else: # hybrid
84
  if client is None:
85
  return (
@@ -87,14 +96,14 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
87
  "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
88
  )
89
  # First pass with TF-IDF
90
- tfidf_classifier = TFIDFClassifier()
91
- tfidf_results = tfidf_classifier.classify(texts, category_list)
92
 
93
  # Second pass with LLM for low confidence results
94
- llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
95
- results = []
96
- low_confidence_texts = []
97
- low_confidence_indices = []
98
 
99
  for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
100
  if tfidf_result["confidence"] < 70: # If confidence is below 70%
@@ -105,7 +114,7 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
105
  results.append(tfidf_result)
106
 
107
  if low_confidence_texts:
108
- llm_results = await llm_classifier.classify_async(
109
  low_confidence_texts, category_list
110
  )
111
  for idx, llm_result in zip(low_confidence_indices, llm_results):
@@ -125,16 +134,22 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
125
  return result_df, validation_report
126
 
127
  except Exception as e:
128
- error_traceback = traceback.format_exc()
129
  return None, f"Error: {str(e)}\n{error_traceback}"
130
 
131
 
132
- def process_file(file, text_columns, categories, classifier_type, show_explanations):
 
 
 
 
 
 
133
  """Synchronous wrapper for process_file_async"""
134
  return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
135
 
136
 
137
- def export_results(df, format_type):
138
  """Export results to a file and return the file path for download"""
139
  if df is None:
140
  return None
@@ -144,18 +159,18 @@ def export_results(df, format_type):
144
  import os
145
 
146
  # Create a temporary directory if it doesn't exist
147
- temp_dir = "temp_exports"
148
  os.makedirs(temp_dir, exist_ok=True)
149
 
150
  # Generate a unique filename
151
- timestamp = time.strftime("%Y%m%d-%H%M%S")
152
- filename = f"classification_results_{timestamp}"
153
 
154
  if format_type == "excel":
155
- file_path = os.path.join(temp_dir, f"{filename}.xlsx")
156
  df.to_excel(file_path, index=False)
157
  else:
158
- file_path = os.path.join(temp_dir, f"{filename}.csv")
159
  df.to_csv(file_path, index=False)
160
 
161
  return file_path
 
3
  import traceback
4
  import asyncio
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from typing import Optional, List, Dict, Any, Tuple, Union
7
+ import pandas as pd
8
+ from pathlib import Path
9
 
10
  from classifiers import TFIDFClassifier, LLMClassifier
11
  from utils import load_data, validate_results
12
  from client import get_client
13
 
14
 
15
+ def update_api_key(api_key: str) -> Tuple[bool, str]:
16
  """Update the OpenAI API key"""
17
  from client import initialize_client
18
  return initialize_client(api_key)
19
 
20
 
21
+ async def process_file_async(
22
+ file: Union[str, Path],
23
+ text_columns: List[str],
24
+ categories: Optional[str],
25
+ classifier_type: str,
26
+ show_explanations: bool
27
+ ) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
28
  """Async version of process_file"""
29
  # Initialize result_df and validation_report
30
+ result_df: Optional[pd.DataFrame] = None
31
+ validation_report: Optional[str] = None
32
 
33
  try:
34
  # Load data from file
35
  if isinstance(file, str):
36
+ df: pd.DataFrame = load_data(file)
37
  else:
38
+ df: pd.DataFrame = load_data(file.name)
39
 
40
  if not text_columns:
41
  return None, "Please select at least one text column"
42
 
43
  # Check if all selected columns exist
44
+ missing_columns: List[str] = [col for col in text_columns if col not in df.columns]
45
  if missing_columns:
46
  return (
47
  None,
 
49
  )
50
 
51
  # Combine text from selected columns
52
+ texts: List[str] = []
53
  for _, row in df.iterrows():
54
+ combined_text: str = " ".join(str(row[col]) for col in text_columns)
55
  texts.append(combined_text)
56
 
57
  # Parse categories if provided
58
+ category_list: List[str] = []
59
  if categories:
60
  category_list = [cat.strip() for cat in categories.split(",")]
61
 
62
  # Select classifier based on data size and user choice
63
+ num_texts: int = len(texts)
64
 
65
  # If no specific model is chosen, select the most appropriate one
66
  if classifier_type == "auto":
 
78
 
79
  # Initialize appropriate classifier
80
  if classifier_type == "tfidf":
81
+ classifier: TFIDFClassifier = TFIDFClassifier()
82
+ results: List[Dict[str, Any]] = classifier.classify(texts, category_list)
83
  elif classifier_type in ["gpt35", "gpt4"]:
84
  if client is None:
85
  return (
86
  None,
87
  "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
88
  )
89
+ model: str = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
90
+ classifier: LLMClassifier = LLMClassifier(client=client, model=model)
91
+ results: List[Dict[str, Any]] = await classifier.classify_async(texts, category_list)
92
  else: # hybrid
93
  if client is None:
94
  return (
 
96
  "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
97
  )
98
  # First pass with TF-IDF
99
+ tfidf_classifier: TFIDFClassifier = TFIDFClassifier()
100
+ tfidf_results: List[Dict[str, Any]] = tfidf_classifier.classify(texts, category_list)
101
 
102
  # Second pass with LLM for low confidence results
103
+ llm_classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
104
+ results: List[Optional[Dict[str, Any]]] = []
105
+ low_confidence_texts: List[str] = []
106
+ low_confidence_indices: List[int] = []
107
 
108
  for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
109
  if tfidf_result["confidence"] < 70: # If confidence is below 70%
 
114
  results.append(tfidf_result)
115
 
116
  if low_confidence_texts:
117
+ llm_results: List[Dict[str, Any]] = await llm_classifier.classify_async(
118
  low_confidence_texts, category_list
119
  )
120
  for idx, llm_result in zip(low_confidence_indices, llm_results):
 
134
  return result_df, validation_report
135
 
136
  except Exception as e:
137
+ error_traceback: str = traceback.format_exc()
138
  return None, f"Error: {str(e)}\n{error_traceback}"
139
 
140
 
141
+ def process_file(
142
+ file: Union[str, Path],
143
+ text_columns: List[str],
144
+ categories: Optional[str],
145
+ classifier_type: str,
146
+ show_explanations: bool
147
+ ) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
148
  """Synchronous wrapper for process_file_async"""
149
  return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
150
 
151
 
152
+ def export_results(df: pd.DataFrame, format_type: str) -> Optional[str]:
153
  """Export results to a file and return the file path for download"""
154
  if df is None:
155
  return None
 
159
  import os
160
 
161
  # Create a temporary directory if it doesn't exist
162
+ temp_dir: str = "temp_exports"
163
  os.makedirs(temp_dir, exist_ok=True)
164
 
165
  # Generate a unique filename
166
+ timestamp: str = time.strftime("%Y%m%d-%H%M%S")
167
+ filename: str = f"classification_results_{timestamp}"
168
 
169
  if format_type == "excel":
170
+ file_path: str = os.path.join(temp_dir, f"{filename}.xlsx")
171
  df.to_excel(file_path, index=False)
172
  else:
173
+ file_path: str = os.path.join(temp_dir, f"{filename}.csv")
174
  df.to_csv(file_path, index=False)
175
 
176
  return file_path
server.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
- from typing import List, Optional
5
  import json
6
  from classifiers.llm import LLMClassifier
7
  from litellm import completion
@@ -13,7 +13,7 @@ from dotenv import load_dotenv
13
  # Load environment variables
14
  load_dotenv()
15
 
16
- app = FastAPI()
17
 
18
  # Configure CORS
19
  app.add_middleware(
@@ -25,8 +25,10 @@ app.add_middleware(
25
  )
26
 
27
  # Initialize client with API key from environment
28
- api_key = os.environ.get("OPENAI_API_KEY")
29
  if api_key:
 
 
30
  success, message = initialize_client(api_key)
31
  if not success:
32
  raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
@@ -36,7 +38,7 @@ if not client:
36
  raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
37
 
38
  # Initialize the LLM classifier
39
- classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
40
 
41
  class TextInput(BaseModel):
42
  text: str
@@ -51,14 +53,14 @@ class CategorySuggestionResponse(BaseModel):
51
  categories: List[str]
52
 
53
  @app.post("/classify", response_model=ClassificationResponse)
54
- async def classify_text(text_input: TextInput):
55
  try:
56
  # Use async classification
57
- results = await classifier.classify_async(
58
  [text_input.text],
59
  text_input.categories
60
  )
61
- result = results[0] # Get first result since we're classifying one text
62
 
63
  return ClassificationResponse(
64
  category=result["category"],
@@ -69,9 +71,9 @@ async def classify_text(text_input: TextInput):
69
  raise HTTPException(status_code=500, detail=str(e))
70
 
71
  @app.post("/suggest-categories", response_model=CategorySuggestionResponse)
72
- async def suggest_categories(texts: List[str]):
73
  try:
74
- categories = await classifier._suggest_categories_async(texts)
75
  return CategorySuggestionResponse(categories=categories)
76
  except Exception as e:
77
  raise HTTPException(status_code=500, detail=str(e))
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
+ from typing import List, Optional, Dict, Any, Tuple
5
  import json
6
  from classifiers.llm import LLMClassifier
7
  from litellm import completion
 
13
  # Load environment variables
14
  load_dotenv()
15
 
16
+ app: FastAPI = FastAPI()
17
 
18
  # Configure CORS
19
  app.add_middleware(
 
25
  )
26
 
27
  # Initialize client with API key from environment
28
+ api_key: Optional[str] = os.environ.get("OPENAI_API_KEY")
29
  if api_key:
30
+ success: bool
31
+ message: str
32
  success, message = initialize_client(api_key)
33
  if not success:
34
  raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
 
38
  raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
39
 
40
  # Initialize the LLM classifier
41
+ classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
42
 
43
  class TextInput(BaseModel):
44
  text: str
 
53
  categories: List[str]
54
 
55
  @app.post("/classify", response_model=ClassificationResponse)
56
+ async def classify_text(text_input: TextInput) -> ClassificationResponse:
57
  try:
58
  # Use async classification
59
+ results: List[Dict[str, Any]] = await classifier.classify_async(
60
  [text_input.text],
61
  text_input.categories
62
  )
63
+ result: Dict[str, Any] = results[0] # Get first result since we're classifying one text
64
 
65
  return ClassificationResponse(
66
  category=result["category"],
 
71
  raise HTTPException(status_code=500, detail=str(e))
72
 
73
  @app.post("/suggest-categories", response_model=CategorySuggestionResponse)
74
+ async def suggest_categories(texts: List[str]) -> CategorySuggestionResponse:
75
  try:
76
+ categories: List[str] = await classifier._suggest_categories_async(texts)
77
  return CategorySuggestionResponse(categories=categories)
78
  except Exception as e:
79
  raise HTTPException(status_code=500, detail=str(e))
test_server.py CHANGED
@@ -1,13 +1,14 @@
1
  import requests
2
  import json
 
3
 
4
- BASE_URL = "http://localhost:8000"
5
 
6
- def test_classify_text():
7
  # Load emails from CSV file
8
  import csv
9
 
10
- emails = []
11
  with open("examples/emails.csv", "r", encoding="utf-8") as file:
12
  reader = csv.DictReader(file)
13
  for row in reader:
@@ -15,7 +16,7 @@ def test_classify_text():
15
 
16
  # Test with default categories using email content
17
  for email in emails[:5]:
18
- response = requests.post(
19
  f"{BASE_URL}/classify",
20
  json={"text": email["contenu"]}
21
  )
@@ -23,11 +24,11 @@ def test_classify_text():
23
  print(json.dumps(response.json(), indent=2))
24
 
25
 
26
- def test_suggest_categories():
27
  # Load reviews from CSV file
28
  import csv
29
 
30
- texts = []
31
  with open("examples/reviews.csv", "r", encoding="utf-8") as file:
32
  reader = csv.DictReader(file)
33
  for row in reader:
@@ -35,7 +36,7 @@ def test_suggest_categories():
35
 
36
  # Use the first few reviews for testing
37
  texts = texts[:5]
38
- response = requests.post(
39
  f"{BASE_URL}/suggest-categories",
40
  json=texts
41
  )
 
1
  import requests
2
  import json
3
+ from typing import List, Dict, Any, Optional
4
 
5
+ BASE_URL: str = "http://localhost:8000"
6
 
7
+ def test_classify_text() -> None:
8
  # Load emails from CSV file
9
  import csv
10
 
11
+ emails: List[Dict[str, str]] = []
12
  with open("examples/emails.csv", "r", encoding="utf-8") as file:
13
  reader = csv.DictReader(file)
14
  for row in reader:
 
16
 
17
  # Test with default categories using email content
18
  for email in emails[:5]:
19
+ response: requests.Response = requests.post(
20
  f"{BASE_URL}/classify",
21
  json={"text": email["contenu"]}
22
  )
 
24
  print(json.dumps(response.json(), indent=2))
25
 
26
 
27
+ def test_suggest_categories() -> None:
28
  # Load reviews from CSV file
29
  import csv
30
 
31
+ texts: List[str] = []
32
  with open("examples/reviews.csv", "r", encoding="utf-8") as file:
33
  reader = csv.DictReader(file)
34
  for row in reader:
 
36
 
37
  # Use the first few reviews for testing
38
  texts = texts[:5]
39
+ response: requests.Response = requests.post(
40
  f"{BASE_URL}/suggest-categories",
41
  json=texts
42
  )
utils.py CHANGED
@@ -6,9 +6,12 @@ from sklearn.decomposition import PCA
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import tempfile
8
  from prompts import VALIDATION_PROMPT
 
 
 
9
 
10
 
11
- def load_data(file_path):
12
  """
13
  Load data from an Excel or CSV file
14
 
@@ -18,7 +21,7 @@ def load_data(file_path):
18
  Returns:
19
  pd.DataFrame: Loaded data
20
  """
21
- file_ext = os.path.splitext(file_path)[1].lower()
22
 
23
  if file_ext == ".xlsx" or file_ext == ".xls":
24
  return pd.read_excel(file_path)
@@ -30,7 +33,7 @@ def load_data(file_path):
30
  )
31
 
32
 
33
- def export_data(df, file_name, format_type="excel"):
34
  """
35
  Export dataframe to file
36
 
@@ -43,11 +46,11 @@ def export_data(df, file_name, format_type="excel"):
43
  str: Path to the exported file
44
  """
45
  # Create export directory if it doesn't exist
46
- export_dir = "exports"
47
  os.makedirs(export_dir, exist_ok=True)
48
 
49
  # Full path for the export file
50
- export_path = os.path.join(export_dir, file_name)
51
 
52
  # Export based on format type
53
  if format_type == "excel":
@@ -58,7 +61,7 @@ def export_data(df, file_name, format_type="excel"):
58
  return export_path
59
 
60
 
61
- def visualize_results(df, text_column, category_column="Category"):
62
  """
63
  Create visualization of classification results
64
 
@@ -73,6 +76,8 @@ def visualize_results(df, text_column, category_column="Category"):
73
  # Check if category column exists
74
  if category_column not in df.columns:
75
  # Create a simple figure with a message
 
 
76
  fig, ax = plt.subplots(figsize=(10, 6))
77
  ax.text(
78
  0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
@@ -82,17 +87,19 @@ def visualize_results(df, text_column, category_column="Category"):
82
  return fig
83
 
84
  # Get categories and their counts
85
- category_counts = df[category_column].value_counts()
86
 
87
  # Create a new figure
 
 
88
  fig, ax = plt.subplots(figsize=(10, 6))
89
 
90
  # Create the histogram
91
- bars = ax.bar(category_counts.index, category_counts.values)
92
 
93
  # Add value labels on top of each bar
94
  for bar in bars:
95
- height = bar.get_height()
96
  ax.text(
97
  bar.get_x() + bar.get_width() / 2.0,
98
  height,
@@ -117,7 +124,7 @@ def visualize_results(df, text_column, category_column="Category"):
117
  return fig
118
 
119
 
120
- def validate_results(df, text_columns, client):
121
  """
122
  Use LLM to validate the classification results
123
 
@@ -131,33 +138,33 @@ def validate_results(df, text_columns, client):
131
  """
132
  try:
133
  # Sample a few rows for validation
134
- sample_size = min(5, len(df))
135
- sample_df = df.sample(n=sample_size, random_state=42)
136
 
137
  # Build validation prompts
138
- validation_prompts = []
139
  for _, row in sample_df.iterrows():
140
  # Combine text from all selected columns
141
- text = " ".join(str(row[col]) for col in text_columns)
142
- assigned_category = row["Category"]
143
- confidence = row["Confidence"]
144
 
145
  validation_prompts.append(
146
  f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
147
  )
148
 
149
  # Use the prompt from prompts.py
150
- prompt = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
151
 
152
  # Call LLM API
153
- response = client.chat.completions.create(
154
  model="gpt-3.5-turbo",
155
  messages=[{"role": "user", "content": prompt}],
156
  temperature=0.3,
157
  max_tokens=400,
158
  )
159
 
160
- validation_report = response.choices[0].message.content.strip()
161
  return validation_report
162
 
163
  except Exception as e:
 
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import tempfile
8
  from prompts import VALIDATION_PROMPT
9
+ from typing import List, Optional, Any, Union
10
+ from pathlib import Path
11
+ from matplotlib.figure import Figure
12
 
13
 
14
+ def load_data(file_path: Union[str, Path]) -> pd.DataFrame:
15
  """
16
  Load data from an Excel or CSV file
17
 
 
21
  Returns:
22
  pd.DataFrame: Loaded data
23
  """
24
+ file_ext: str = os.path.splitext(file_path)[1].lower()
25
 
26
  if file_ext == ".xlsx" or file_ext == ".xls":
27
  return pd.read_excel(file_path)
 
33
  )
34
 
35
 
36
+ def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
37
  """
38
  Export dataframe to file
39
 
 
46
  str: Path to the exported file
47
  """
48
  # Create export directory if it doesn't exist
49
+ export_dir: str = "exports"
50
  os.makedirs(export_dir, exist_ok=True)
51
 
52
  # Full path for the export file
53
+ export_path: str = os.path.join(export_dir, file_name)
54
 
55
  # Export based on format type
56
  if format_type == "excel":
 
61
  return export_path
62
 
63
 
64
+ def visualize_results(df: pd.DataFrame, text_column: str, category_column: str = "Category") -> Figure:
65
  """
66
  Create visualization of classification results
67
 
 
76
  # Check if category column exists
77
  if category_column not in df.columns:
78
  # Create a simple figure with a message
79
+ fig: Figure
80
+ ax: Any
81
  fig, ax = plt.subplots(figsize=(10, 6))
82
  ax.text(
83
  0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
 
87
  return fig
88
 
89
  # Get categories and their counts
90
+ category_counts: pd.Series = df[category_column].value_counts()
91
 
92
  # Create a new figure
93
+ fig: Figure
94
+ ax: Any
95
  fig, ax = plt.subplots(figsize=(10, 6))
96
 
97
  # Create the histogram
98
+ bars: Any = ax.bar(category_counts.index, category_counts.values)
99
 
100
  # Add value labels on top of each bar
101
  for bar in bars:
102
+ height: float = bar.get_height()
103
  ax.text(
104
  bar.get_x() + bar.get_width() / 2.0,
105
  height,
 
124
  return fig
125
 
126
 
127
+ def validate_results(df: pd.DataFrame, text_columns: List[str], client: Any) -> str:
128
  """
129
  Use LLM to validate the classification results
130
 
 
138
  """
139
  try:
140
  # Sample a few rows for validation
141
+ sample_size: int = min(5, len(df))
142
+ sample_df: pd.DataFrame = df.sample(n=sample_size, random_state=42)
143
 
144
  # Build validation prompts
145
+ validation_prompts: List[str] = []
146
  for _, row in sample_df.iterrows():
147
  # Combine text from all selected columns
148
+ text: str = " ".join(str(row[col]) for col in text_columns)
149
+ assigned_category: str = row["Category"]
150
+ confidence: float = row["Confidence"]
151
 
152
  validation_prompts.append(
153
  f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
154
  )
155
 
156
  # Use the prompt from prompts.py
157
+ prompt: str = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
158
 
159
  # Call LLM API
160
+ response: Any = client.chat.completions.create(
161
  model="gpt-3.5-turbo",
162
  messages=[{"role": "user", "content": prompt}],
163
  temperature=0.3,
164
  max_tokens=400,
165
  )
166
 
167
+ validation_report: str = response.choices[0].message.content.strip()
168
  return validation_report
169
 
170
  except Exception as e: