Spaces:
Sleeping
Sleeping
add types
Browse files
- classifiers/base.py +4 -6
- classifiers/llm.py +20 -19
- classifiers/tfidf.py +29 -29
- client.py +6 -5
- process.py +46 -31
- server.py +11 -9
- test_server.py +8 -7
- utils.py +26 -19
classifiers/base.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
@@ -15,10 +13,10 @@ from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
|
15 |
class BaseClassifier:
|
16 |
"""Base class for text classifiers"""
|
17 |
|
18 |
-
def __init__(self):
|
19 |
pass
|
20 |
|
21 |
-
def classify(self, texts, categories=None):
|
22 |
"""
|
23 |
Classify a list of texts into categories
|
24 |
|
@@ -31,7 +29,7 @@ class BaseClassifier:
|
|
31 |
"""
|
32 |
raise NotImplementedError("Subclasses must implement this method")
|
33 |
|
34 |
-
def _generate_default_categories(self, texts, num_clusters=5):
|
35 |
"""
|
36 |
Generate default categories based on text clustering
|
37 |
|
@@ -43,6 +41,6 @@ class BaseClassifier:
|
|
43 |
list: List of category names
|
44 |
"""
|
45 |
# Simple implementation - in real system this would be more sophisticated
|
46 |
-
default_categories = [f"Category {i+1}" for i in range(num_clusters)]
|
47 |
return default_categories
|
48 |
|
|
|
|
|
|
|
1 |
import numpy as np
|
2 |
import pandas as pd
|
3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
13 |
class BaseClassifier:
|
14 |
"""Base class for text classifiers"""
|
15 |
|
16 |
+
def __init__(self) -> None:
|
17 |
pass
|
18 |
|
19 |
+
def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
20 |
"""
|
21 |
Classify a list of texts into categories
|
22 |
|
|
|
29 |
"""
|
30 |
raise NotImplementedError("Subclasses must implement this method")
|
31 |
|
32 |
+
def _generate_default_categories(self, texts: List[str], num_clusters: int = 5) -> List[str]:
|
33 |
"""
|
34 |
Generate default categories based on text clustering
|
35 |
|
|
|
41 |
list: List of category names
|
42 |
"""
|
43 |
# Simple implementation - in real system this would be more sophisticated
|
44 |
+
default_categories: List[str] = [f"Category {i+1}" for i in range(num_clusters)]
|
45 |
return default_categories
|
46 |
|
classifiers/llm.py
CHANGED
@@ -6,9 +6,10 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
6 |
import random
|
7 |
import json
|
8 |
import asyncio
|
9 |
-
from typing import List, Dict, Any, Optional
|
10 |
import sys
|
11 |
import os
|
|
|
12 |
|
13 |
# Add the project root to the Python path
|
14 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
@@ -20,22 +21,22 @@ from .base import BaseClassifier
|
|
20 |
class LLMClassifier(BaseClassifier):
|
21 |
"""Classifier using a Large Language Model for more accurate but slower classification"""
|
22 |
|
23 |
-
def __init__(self, client, model="gpt-3.5-turbo"):
|
24 |
super().__init__()
|
25 |
-
self.client = client
|
26 |
-
self.model = model
|
27 |
|
28 |
async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
29 |
"""Async version of text classification"""
|
30 |
-
prompt = TEXT_CLASSIFICATION_PROMPT.format(
|
31 |
categories=", ".join(categories),
|
32 |
text=text
|
33 |
)
|
34 |
|
35 |
try:
|
36 |
# Use the synchronous client method but run it in a thread pool
|
37 |
-
loop = asyncio.get_event_loop()
|
38 |
-
response = await loop.run_in_executor(
|
39 |
None,
|
40 |
lambda: self.client.chat.completions.create(
|
41 |
model=self.model,
|
@@ -46,8 +47,8 @@ class LLMClassifier(BaseClassifier):
|
|
46 |
)
|
47 |
|
48 |
# Parse JSON response
|
49 |
-
response_text = response.choices[0].message.content.strip()
|
50 |
-
result = json.loads(response_text)
|
51 |
|
52 |
# Ensure all required fields are present
|
53 |
if not all(k in result for k in ["category", "confidence", "explanation"]):
|
@@ -68,7 +69,7 @@ class LLMClassifier(BaseClassifier):
|
|
68 |
return result
|
69 |
except json.JSONDecodeError:
|
70 |
# Fall back to simple parsing if JSON fails
|
71 |
-
category = categories[0] # Default
|
72 |
for cat in categories:
|
73 |
if cat.lower() in response_text.lower():
|
74 |
category = cat
|
@@ -90,16 +91,16 @@ class LLMClassifier(BaseClassifier):
|
|
90 |
"""Async version of category suggestion"""
|
91 |
# Take a sample of texts to avoid token limitations
|
92 |
if len(texts) > sample_size:
|
93 |
-
sample_texts = random.sample(texts, sample_size)
|
94 |
else:
|
95 |
-
sample_texts = texts
|
96 |
|
97 |
-
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
|
98 |
|
99 |
try:
|
100 |
# Use the synchronous client method but run it in a thread pool
|
101 |
-
loop = asyncio.get_event_loop()
|
102 |
-
response = await loop.run_in_executor(
|
103 |
None,
|
104 |
lambda: self.client.chat.completions.create(
|
105 |
model=self.model,
|
@@ -110,8 +111,8 @@ class LLMClassifier(BaseClassifier):
|
|
110 |
)
|
111 |
|
112 |
# Parse response to get categories
|
113 |
-
categories_text = response.choices[0].message.content.strip()
|
114 |
-
categories = [cat.strip() for cat in categories_text.split(",")]
|
115 |
|
116 |
return categories
|
117 |
except Exception as e:
|
@@ -127,10 +128,10 @@ class LLMClassifier(BaseClassifier):
|
|
127 |
categories = await self._suggest_categories_async(texts)
|
128 |
|
129 |
# Create tasks for all texts
|
130 |
-
tasks = [self._classify_text_async(text, categories) for text in texts]
|
131 |
|
132 |
# Gather all results
|
133 |
-
results = await asyncio.gather(*tasks)
|
134 |
return results
|
135 |
|
136 |
def classify(
|
|
|
6 |
import random
|
7 |
import json
|
8 |
import asyncio
|
9 |
+
from typing import List, Dict, Any, Optional, Union
|
10 |
import sys
|
11 |
import os
|
12 |
+
from litellm import OpenAI
|
13 |
|
14 |
# Add the project root to the Python path
|
15 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
21 |
class LLMClassifier(BaseClassifier):
|
22 |
"""Classifier using a Large Language Model for more accurate but slower classification"""
|
23 |
|
24 |
+
def __init__(self, client: OpenAI, model: str = "gpt-3.5-turbo") -> None:
|
25 |
super().__init__()
|
26 |
+
self.client: OpenAI = client
|
27 |
+
self.model: str = model
|
28 |
|
29 |
async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
30 |
"""Async version of text classification"""
|
31 |
+
prompt: str = TEXT_CLASSIFICATION_PROMPT.format(
|
32 |
categories=", ".join(categories),
|
33 |
text=text
|
34 |
)
|
35 |
|
36 |
try:
|
37 |
# Use the synchronous client method but run it in a thread pool
|
38 |
+
loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
|
39 |
+
response: Any = await loop.run_in_executor(
|
40 |
None,
|
41 |
lambda: self.client.chat.completions.create(
|
42 |
model=self.model,
|
|
|
47 |
)
|
48 |
|
49 |
# Parse JSON response
|
50 |
+
response_text: str = response.choices[0].message.content.strip()
|
51 |
+
result: Dict[str, Any] = json.loads(response_text)
|
52 |
|
53 |
# Ensure all required fields are present
|
54 |
if not all(k in result for k in ["category", "confidence", "explanation"]):
|
|
|
69 |
return result
|
70 |
except json.JSONDecodeError:
|
71 |
# Fall back to simple parsing if JSON fails
|
72 |
+
category: str = categories[0] # Default
|
73 |
for cat in categories:
|
74 |
if cat.lower() in response_text.lower():
|
75 |
category = cat
|
|
|
91 |
"""Async version of category suggestion"""
|
92 |
# Take a sample of texts to avoid token limitations
|
93 |
if len(texts) > sample_size:
|
94 |
+
sample_texts: List[str] = random.sample(texts, sample_size)
|
95 |
else:
|
96 |
+
sample_texts: List[str] = texts
|
97 |
|
98 |
+
prompt: str = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
|
99 |
|
100 |
try:
|
101 |
# Use the synchronous client method but run it in a thread pool
|
102 |
+
loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
|
103 |
+
response: Any = await loop.run_in_executor(
|
104 |
None,
|
105 |
lambda: self.client.chat.completions.create(
|
106 |
model=self.model,
|
|
|
111 |
)
|
112 |
|
113 |
# Parse response to get categories
|
114 |
+
categories_text: str = response.choices[0].message.content.strip()
|
115 |
+
categories: List[str] = [cat.strip() for cat in categories_text.split(",")]
|
116 |
|
117 |
return categories
|
118 |
except Exception as e:
|
|
|
128 |
categories = await self._suggest_categories_async(texts)
|
129 |
|
130 |
# Create tasks for all texts
|
131 |
+
tasks: List[asyncio.Task] = [self._classify_text_async(text, categories) for text in texts]
|
132 |
|
133 |
# Gather all results
|
134 |
+
results: List[Dict[str, Any]] = await asyncio.gather(*tasks)
|
135 |
return results
|
136 |
|
137 |
def classify(
|
classifiers/tfidf.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
@@ -9,6 +8,7 @@ import json
|
|
9 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
10 |
from typing import List, Dict, Any, Optional
|
11 |
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
|
|
12 |
|
13 |
from .base import BaseClassifier
|
14 |
|
@@ -16,25 +16,25 @@ from .base import BaseClassifier
|
|
16 |
class TFIDFClassifier(BaseClassifier):
|
17 |
"""Classifier using TF-IDF and clustering for fast classification"""
|
18 |
|
19 |
-
def __init__(self):
|
20 |
super().__init__()
|
21 |
-
self.vectorizer = TfidfVectorizer(
|
22 |
max_features=1000, stop_words="english", ngram_range=(1, 2)
|
23 |
)
|
24 |
-
self.model = None
|
25 |
-
self.feature_names = None
|
26 |
-
self.categories = None
|
27 |
-
self.centroids = None
|
28 |
|
29 |
-
def classify(self, texts, categories=None):
|
30 |
"""Classify texts using TF-IDF and clustering"""
|
31 |
# Vectorize the texts
|
32 |
-
X = self.vectorizer.fit_transform(texts)
|
33 |
self.feature_names = self.vectorizer.get_feature_names_out()
|
34 |
|
35 |
# Auto-detect categories if not provided
|
36 |
if not categories:
|
37 |
-
num_clusters = min(5, len(texts)) # Don't create more clusters than texts
|
38 |
self.categories = self._generate_default_categories(texts, num_clusters)
|
39 |
else:
|
40 |
self.categories = categories
|
@@ -42,22 +42,22 @@ class TFIDFClassifier(BaseClassifier):
|
|
42 |
|
43 |
# Cluster the texts
|
44 |
self.model = KMeans(n_clusters=num_clusters, random_state=42)
|
45 |
-
clusters = self.model.fit_predict(X)
|
46 |
self.centroids = self.model.cluster_centers_
|
47 |
|
48 |
# Calculate distances to centroids for confidence
|
49 |
-
distances = self._calculate_distances(X)
|
50 |
|
51 |
# Prepare results
|
52 |
-
results = []
|
53 |
for i, text in enumerate(texts):
|
54 |
-
cluster_idx = clusters[i]
|
55 |
|
56 |
# Calculate confidence (inverse of distance, normalized)
|
57 |
-
confidence = self._calculate_confidence(distances[i])
|
58 |
|
59 |
# Create explanation
|
60 |
-
explanation = self._generate_explanation(X[i], cluster_idx)
|
61 |
|
62 |
results.append(
|
63 |
{
|
@@ -69,7 +69,7 @@ class TFIDFClassifier(BaseClassifier):
|
|
69 |
|
70 |
return results
|
71 |
|
72 |
-
def _calculate_distances(self, X):
|
73 |
"""Calculate distances from each point to each centroid"""
|
74 |
return np.sqrt(
|
75 |
(
|
@@ -77,37 +77,37 @@ class TFIDFClassifier(BaseClassifier):
|
|
77 |
).sum(axis=2)
|
78 |
)
|
79 |
|
80 |
-
def _calculate_confidence(self, distances):
|
81 |
"""Convert distances to confidence scores (0-100)"""
|
82 |
-
min_dist = np.min(distances)
|
83 |
-
max_dist = np.max(distances)
|
84 |
|
85 |
# Normalize and invert (smaller distance = higher confidence)
|
86 |
if max_dist == min_dist:
|
87 |
return 70 # Default mid-range confidence when all distances are equal
|
88 |
|
89 |
-
normalized_dist = (distances - min_dist) / (max_dist - min_dist)
|
90 |
-
min_normalized = np.min(normalized_dist)
|
91 |
|
92 |
# Invert and scale to 50-100 range (TF-IDF is never 100% confident)
|
93 |
-
confidence = 100 - (min_normalized * 50)
|
94 |
return round(confidence, 1)
|
95 |
|
96 |
-
def _generate_explanation(self, text_vector, cluster_idx):
|
97 |
"""Generate an explanation for the classification"""
|
98 |
# Get the most important features for this cluster
|
99 |
-
centroid = self.centroids[cluster_idx]
|
100 |
|
101 |
# Get indices of top features for this text
|
102 |
-
text_array = text_vector.toarray()[0]
|
103 |
-
top_indices = text_array.argsort()[-5:][::-1]
|
104 |
|
105 |
# Get the feature names for these indices
|
106 |
-
top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
|
107 |
|
108 |
if not top_features:
|
109 |
return "No significant features identified for this classification."
|
110 |
|
111 |
-
explanation = f"Classification based on key terms: {', '.join(top_features)}"
|
112 |
return explanation
|
113 |
|
|
|
|
|
1 |
import numpy as np
|
2 |
import pandas as pd
|
3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9 |
from typing import List, Dict, Any, Optional
|
10 |
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
11 |
+
from scipy.sparse import csr_matrix
|
12 |
|
13 |
from .base import BaseClassifier
|
14 |
|
|
|
16 |
class TFIDFClassifier(BaseClassifier):
|
17 |
"""Classifier using TF-IDF and clustering for fast classification"""
|
18 |
|
19 |
+
def __init__(self) -> None:
|
20 |
super().__init__()
|
21 |
+
self.vectorizer: TfidfVectorizer = TfidfVectorizer(
|
22 |
max_features=1000, stop_words="english", ngram_range=(1, 2)
|
23 |
)
|
24 |
+
self.model: Optional[KMeans] = None
|
25 |
+
self.feature_names: Optional[np.ndarray] = None
|
26 |
+
self.categories: Optional[List[str]] = None
|
27 |
+
self.centroids: Optional[np.ndarray] = None
|
28 |
|
29 |
+
def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
30 |
"""Classify texts using TF-IDF and clustering"""
|
31 |
# Vectorize the texts
|
32 |
+
X: csr_matrix = self.vectorizer.fit_transform(texts)
|
33 |
self.feature_names = self.vectorizer.get_feature_names_out()
|
34 |
|
35 |
# Auto-detect categories if not provided
|
36 |
if not categories:
|
37 |
+
num_clusters: int = min(5, len(texts)) # Don't create more clusters than texts
|
38 |
self.categories = self._generate_default_categories(texts, num_clusters)
|
39 |
else:
|
40 |
self.categories = categories
|
|
|
42 |
|
43 |
# Cluster the texts
|
44 |
self.model = KMeans(n_clusters=num_clusters, random_state=42)
|
45 |
+
clusters: np.ndarray = self.model.fit_predict(X)
|
46 |
self.centroids = self.model.cluster_centers_
|
47 |
|
48 |
# Calculate distances to centroids for confidence
|
49 |
+
distances: np.ndarray = self._calculate_distances(X)
|
50 |
|
51 |
# Prepare results
|
52 |
+
results: List[Dict[str, Any]] = []
|
53 |
for i, text in enumerate(texts):
|
54 |
+
cluster_idx: int = clusters[i]
|
55 |
|
56 |
# Calculate confidence (inverse of distance, normalized)
|
57 |
+
confidence: float = self._calculate_confidence(distances[i])
|
58 |
|
59 |
# Create explanation
|
60 |
+
explanation: str = self._generate_explanation(X[i], cluster_idx)
|
61 |
|
62 |
results.append(
|
63 |
{
|
|
|
69 |
|
70 |
return results
|
71 |
|
72 |
+
def _calculate_distances(self, X: csr_matrix) -> np.ndarray:
|
73 |
"""Calculate distances from each point to each centroid"""
|
74 |
return np.sqrt(
|
75 |
(
|
|
|
77 |
).sum(axis=2)
|
78 |
)
|
79 |
|
80 |
+
def _calculate_confidence(self, distances: np.ndarray) -> float:
|
81 |
"""Convert distances to confidence scores (0-100)"""
|
82 |
+
min_dist: float = np.min(distances)
|
83 |
+
max_dist: float = np.max(distances)
|
84 |
|
85 |
# Normalize and invert (smaller distance = higher confidence)
|
86 |
if max_dist == min_dist:
|
87 |
return 70 # Default mid-range confidence when all distances are equal
|
88 |
|
89 |
+
normalized_dist: np.ndarray = (distances - min_dist) / (max_dist - min_dist)
|
90 |
+
min_normalized: float = np.min(normalized_dist)
|
91 |
|
92 |
# Invert and scale to 50-100 range (TF-IDF is never 100% confident)
|
93 |
+
confidence: float = 100 - (min_normalized * 50)
|
94 |
return round(confidence, 1)
|
95 |
|
96 |
+
def _generate_explanation(self, text_vector: csr_matrix, cluster_idx: int) -> str:
|
97 |
"""Generate an explanation for the classification"""
|
98 |
# Get the most important features for this cluster
|
99 |
+
centroid: np.ndarray = self.centroids[cluster_idx]
|
100 |
|
101 |
# Get indices of top features for this text
|
102 |
+
text_array: np.ndarray = text_vector.toarray()[0]
|
103 |
+
top_indices: np.ndarray = text_array.argsort()[-5:][::-1]
|
104 |
|
105 |
# Get the feature names for these indices
|
106 |
+
top_features: List[str] = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
|
107 |
|
108 |
if not top_features:
|
109 |
return "No significant features identified for this classification."
|
110 |
|
111 |
+
explanation: str = f"Classification based on key terms: {', '.join(top_features)}"
|
112 |
return explanation
|
113 |
|
client.py
CHANGED
@@ -1,19 +1,20 @@
|
|
1 |
from litellm import OpenAI
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
|
|
4 |
|
5 |
# Load environment variables
|
6 |
load_dotenv()
|
7 |
|
8 |
# Initialize client as None
|
9 |
-
client = None
|
10 |
|
11 |
-
def get_client():
|
12 |
"""Get the OpenAI client instance"""
|
13 |
global client
|
14 |
return client
|
15 |
|
16 |
-
def initialize_client(api_key=None):
|
17 |
"""Initialize the OpenAI client with an API key"""
|
18 |
global client
|
19 |
import logging
|
@@ -28,7 +29,7 @@ def initialize_client(api_key=None):
|
|
28 |
try:
|
29 |
client = OpenAI(api_key=api_key)
|
30 |
# Test the connection with a simple request
|
31 |
-
response = client.chat.completions.create(
|
32 |
model="gpt-3.5-turbo",
|
33 |
messages=[{"role": "user", "content": "test"}],
|
34 |
max_tokens=5,
|
@@ -37,6 +38,6 @@ def initialize_client(api_key=None):
|
|
37 |
return True, "API Key updated and verified successfully"
|
38 |
except Exception as e:
|
39 |
client = None
|
40 |
-
error_message = f"Failed to initialize client: {str(e)}"
|
41 |
logging.error(error_message)
|
42 |
return False, error_message
|
|
|
1 |
from litellm import OpenAI
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
4 |
+
from typing import Optional, Tuple, Any
|
5 |
|
6 |
# Load environment variables
|
7 |
load_dotenv()
|
8 |
|
9 |
# Initialize client as None
|
10 |
+
client: Optional[OpenAI] = None
|
11 |
|
12 |
+
def get_client() -> Optional[OpenAI]:
|
13 |
"""Get the OpenAI client instance"""
|
14 |
global client
|
15 |
return client
|
16 |
|
17 |
+
def initialize_client(api_key: Optional[str] = None) -> Tuple[bool, str]:
|
18 |
"""Initialize the OpenAI client with an API key"""
|
19 |
global client
|
20 |
import logging
|
|
|
29 |
try:
|
30 |
client = OpenAI(api_key=api_key)
|
31 |
# Test the connection with a simple request
|
32 |
+
response: Any = client.chat.completions.create(
|
33 |
model="gpt-3.5-turbo",
|
34 |
messages=[{"role": "user", "content": "test"}],
|
35 |
max_tokens=5,
|
|
|
38 |
return True, "API Key updated and verified successfully"
|
39 |
except Exception as e:
|
40 |
client = None
|
41 |
+
error_message: str = f"Failed to initialize client: {str(e)}"
|
42 |
logging.error(error_message)
|
43 |
return False, error_message
|
process.py
CHANGED
@@ -3,36 +3,45 @@ import time
|
|
3 |
import traceback
|
4 |
import asyncio
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
|
6 |
|
7 |
from classifiers import TFIDFClassifier, LLMClassifier
|
8 |
from utils import load_data, validate_results
|
9 |
from client import get_client
|
10 |
|
11 |
|
12 |
-
def update_api_key(api_key):
|
13 |
"""Update the OpenAI API key"""
|
14 |
from client import initialize_client
|
15 |
return initialize_client(api_key)
|
16 |
|
17 |
|
18 |
-
async def process_file_async(
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
"""Async version of process_file"""
|
20 |
# Initialize result_df and validation_report
|
21 |
-
result_df = None
|
22 |
-
validation_report = None
|
23 |
|
24 |
try:
|
25 |
# Load data from file
|
26 |
if isinstance(file, str):
|
27 |
-
df = load_data(file)
|
28 |
else:
|
29 |
-
df = load_data(file.name)
|
30 |
|
31 |
if not text_columns:
|
32 |
return None, "Please select at least one text column"
|
33 |
|
34 |
# Check if all selected columns exist
|
35 |
-
missing_columns = [col for col in text_columns if col not in df.columns]
|
36 |
if missing_columns:
|
37 |
return (
|
38 |
None,
|
@@ -40,18 +49,18 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
|
|
40 |
)
|
41 |
|
42 |
# Combine text from selected columns
|
43 |
-
texts = []
|
44 |
for _, row in df.iterrows():
|
45 |
-
combined_text = " ".join(str(row[col]) for col in text_columns)
|
46 |
texts.append(combined_text)
|
47 |
|
48 |
# Parse categories if provided
|
49 |
-
category_list = []
|
50 |
if categories:
|
51 |
category_list = [cat.strip() for cat in categories.split(",")]
|
52 |
|
53 |
# Select classifier based on data size and user choice
|
54 |
-
num_texts = len(texts)
|
55 |
|
56 |
# If no specific model is chosen, select the most appropriate one
|
57 |
if classifier_type == "auto":
|
@@ -69,17 +78,17 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
|
|
69 |
|
70 |
# Initialize appropriate classifier
|
71 |
if classifier_type == "tfidf":
|
72 |
-
classifier = TFIDFClassifier()
|
73 |
-
results = classifier.classify(texts, category_list)
|
74 |
elif classifier_type in ["gpt35", "gpt4"]:
|
75 |
if client is None:
|
76 |
return (
|
77 |
None,
|
78 |
"Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
|
79 |
)
|
80 |
-
model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
|
81 |
-
classifier = LLMClassifier(client=client, model=model)
|
82 |
-
results = await classifier.classify_async(texts, category_list)
|
83 |
else: # hybrid
|
84 |
if client is None:
|
85 |
return (
|
@@ -87,14 +96,14 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
|
|
87 |
"Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
|
88 |
)
|
89 |
# First pass with TF-IDF
|
90 |
-
tfidf_classifier = TFIDFClassifier()
|
91 |
-
tfidf_results = tfidf_classifier.classify(texts, category_list)
|
92 |
|
93 |
# Second pass with LLM for low confidence results
|
94 |
-
llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
95 |
-
results = []
|
96 |
-
low_confidence_texts = []
|
97 |
-
low_confidence_indices = []
|
98 |
|
99 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
100 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
@@ -105,7 +114,7 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
|
|
105 |
results.append(tfidf_result)
|
106 |
|
107 |
if low_confidence_texts:
|
108 |
-
llm_results = await llm_classifier.classify_async(
|
109 |
low_confidence_texts, category_list
|
110 |
)
|
111 |
for idx, llm_result in zip(low_confidence_indices, llm_results):
|
@@ -125,16 +134,22 @@ async def process_file_async(file, text_columns, categories, classifier_type, sh
|
|
125 |
return result_df, validation_report
|
126 |
|
127 |
except Exception as e:
|
128 |
-
error_traceback = traceback.format_exc()
|
129 |
return None, f"Error: {str(e)}\n{error_traceback}"
|
130 |
|
131 |
|
132 |
-
def process_file(
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
"""Synchronous wrapper for process_file_async"""
|
134 |
return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
|
135 |
|
136 |
|
137 |
-
def export_results(df, format_type):
|
138 |
"""Export results to a file and return the file path for download"""
|
139 |
if df is None:
|
140 |
return None
|
@@ -144,18 +159,18 @@ def export_results(df, format_type):
|
|
144 |
import os
|
145 |
|
146 |
# Create a temporary directory if it doesn't exist
|
147 |
-
temp_dir = "temp_exports"
|
148 |
os.makedirs(temp_dir, exist_ok=True)
|
149 |
|
150 |
# Generate a unique filename
|
151 |
-
timestamp = time.strftime("%Y%m%d-%H%M%S")
|
152 |
-
filename = f"classification_results_{timestamp}"
|
153 |
|
154 |
if format_type == "excel":
|
155 |
-
file_path = os.path.join(temp_dir, f"{filename}.xlsx")
|
156 |
df.to_excel(file_path, index=False)
|
157 |
else:
|
158 |
-
file_path = os.path.join(temp_dir, f"{filename}.csv")
|
159 |
df.to_csv(file_path, index=False)
|
160 |
|
161 |
return file_path
|
|
|
3 |
import traceback
|
4 |
import asyncio
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
+
from typing import Optional, List, Dict, Any, Tuple, Union
|
7 |
+
import pandas as pd
|
8 |
+
from pathlib import Path
|
9 |
|
10 |
from classifiers import TFIDFClassifier, LLMClassifier
|
11 |
from utils import load_data, validate_results
|
12 |
from client import get_client
|
13 |
|
14 |
|
15 |
+
def update_api_key(api_key: str) -> Tuple[bool, str]:
|
16 |
"""Update the OpenAI API key"""
|
17 |
from client import initialize_client
|
18 |
return initialize_client(api_key)
|
19 |
|
20 |
|
21 |
+
async def process_file_async(
|
22 |
+
file: Union[str, Path],
|
23 |
+
text_columns: List[str],
|
24 |
+
categories: Optional[str],
|
25 |
+
classifier_type: str,
|
26 |
+
show_explanations: bool
|
27 |
+
) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
28 |
"""Async version of process_file"""
|
29 |
# Initialize result_df and validation_report
|
30 |
+
result_df: Optional[pd.DataFrame] = None
|
31 |
+
validation_report: Optional[str] = None
|
32 |
|
33 |
try:
|
34 |
# Load data from file
|
35 |
if isinstance(file, str):
|
36 |
+
df: pd.DataFrame = load_data(file)
|
37 |
else:
|
38 |
+
df: pd.DataFrame = load_data(file.name)
|
39 |
|
40 |
if not text_columns:
|
41 |
return None, "Please select at least one text column"
|
42 |
|
43 |
# Check if all selected columns exist
|
44 |
+
missing_columns: List[str] = [col for col in text_columns if col not in df.columns]
|
45 |
if missing_columns:
|
46 |
return (
|
47 |
None,
|
|
|
49 |
)
|
50 |
|
51 |
# Combine text from selected columns
|
52 |
+
texts: List[str] = []
|
53 |
for _, row in df.iterrows():
|
54 |
+
combined_text: str = " ".join(str(row[col]) for col in text_columns)
|
55 |
texts.append(combined_text)
|
56 |
|
57 |
# Parse categories if provided
|
58 |
+
category_list: List[str] = []
|
59 |
if categories:
|
60 |
category_list = [cat.strip() for cat in categories.split(",")]
|
61 |
|
62 |
# Select classifier based on data size and user choice
|
63 |
+
num_texts: int = len(texts)
|
64 |
|
65 |
# If no specific model is chosen, select the most appropriate one
|
66 |
if classifier_type == "auto":
|
|
|
78 |
|
79 |
# Initialize appropriate classifier
|
80 |
if classifier_type == "tfidf":
|
81 |
+
classifier: TFIDFClassifier = TFIDFClassifier()
|
82 |
+
results: List[Dict[str, Any]] = classifier.classify(texts, category_list)
|
83 |
elif classifier_type in ["gpt35", "gpt4"]:
|
84 |
if client is None:
|
85 |
return (
|
86 |
None,
|
87 |
"Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
|
88 |
)
|
89 |
+
model: str = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
|
90 |
+
classifier: LLMClassifier = LLMClassifier(client=client, model=model)
|
91 |
+
results: List[Dict[str, Any]] = await classifier.classify_async(texts, category_list)
|
92 |
else: # hybrid
|
93 |
if client is None:
|
94 |
return (
|
|
|
96 |
"Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
|
97 |
)
|
98 |
# First pass with TF-IDF
|
99 |
+
tfidf_classifier: TFIDFClassifier = TFIDFClassifier()
|
100 |
+
tfidf_results: List[Dict[str, Any]] = tfidf_classifier.classify(texts, category_list)
|
101 |
|
102 |
# Second pass with LLM for low confidence results
|
103 |
+
llm_classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
104 |
+
results: List[Optional[Dict[str, Any]]] = []
|
105 |
+
low_confidence_texts: List[str] = []
|
106 |
+
low_confidence_indices: List[int] = []
|
107 |
|
108 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
109 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
|
|
114 |
results.append(tfidf_result)
|
115 |
|
116 |
if low_confidence_texts:
|
117 |
+
llm_results: List[Dict[str, Any]] = await llm_classifier.classify_async(
|
118 |
low_confidence_texts, category_list
|
119 |
)
|
120 |
for idx, llm_result in zip(low_confidence_indices, llm_results):
|
|
|
134 |
return result_df, validation_report
|
135 |
|
136 |
except Exception as e:
|
137 |
+
error_traceback: str = traceback.format_exc()
|
138 |
return None, f"Error: {str(e)}\n{error_traceback}"
|
139 |
|
140 |
|
141 |
+
def process_file(
|
142 |
+
file: Union[str, Path],
|
143 |
+
text_columns: List[str],
|
144 |
+
categories: Optional[str],
|
145 |
+
classifier_type: str,
|
146 |
+
show_explanations: bool
|
147 |
+
) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
148 |
"""Synchronous wrapper for process_file_async"""
|
149 |
return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
|
150 |
|
151 |
|
152 |
+
def export_results(df: pd.DataFrame, format_type: str) -> Optional[str]:
|
153 |
"""Export results to a file and return the file path for download"""
|
154 |
if df is None:
|
155 |
return None
|
|
|
159 |
import os
|
160 |
|
161 |
# Create a temporary directory if it doesn't exist
|
162 |
+
temp_dir: str = "temp_exports"
|
163 |
os.makedirs(temp_dir, exist_ok=True)
|
164 |
|
165 |
# Generate a unique filename
|
166 |
+
timestamp: str = time.strftime("%Y%m%d-%H%M%S")
|
167 |
+
filename: str = f"classification_results_{timestamp}"
|
168 |
|
169 |
if format_type == "excel":
|
170 |
+
file_path: str = os.path.join(temp_dir, f"{filename}.xlsx")
|
171 |
df.to_excel(file_path, index=False)
|
172 |
else:
|
173 |
+
file_path: str = os.path.join(temp_dir, f"{filename}.csv")
|
174 |
df.to_csv(file_path, index=False)
|
175 |
|
176 |
return file_path
|
server.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from fastapi import FastAPI, HTTPException
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from pydantic import BaseModel
|
4 |
-
from typing import List, Optional
|
5 |
import json
|
6 |
from classifiers.llm import LLMClassifier
|
7 |
from litellm import completion
|
@@ -13,7 +13,7 @@ from dotenv import load_dotenv
|
|
13 |
# Load environment variables
|
14 |
load_dotenv()
|
15 |
|
16 |
-
app = FastAPI()
|
17 |
|
18 |
# Configure CORS
|
19 |
app.add_middleware(
|
@@ -25,8 +25,10 @@ app.add_middleware(
|
|
25 |
)
|
26 |
|
27 |
# Initialize client with API key from environment
|
28 |
-
api_key = os.environ.get("OPENAI_API_KEY")
|
29 |
if api_key:
|
|
|
|
|
30 |
success, message = initialize_client(api_key)
|
31 |
if not success:
|
32 |
raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
|
@@ -36,7 +38,7 @@ if not client:
|
|
36 |
raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
|
37 |
|
38 |
# Initialize the LLM classifier
|
39 |
-
classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
40 |
|
41 |
class TextInput(BaseModel):
|
42 |
text: str
|
@@ -51,14 +53,14 @@ class CategorySuggestionResponse(BaseModel):
|
|
51 |
categories: List[str]
|
52 |
|
53 |
@app.post("/classify", response_model=ClassificationResponse)
|
54 |
-
async def classify_text(text_input: TextInput):
|
55 |
try:
|
56 |
# Use async classification
|
57 |
-
results = await classifier.classify_async(
|
58 |
[text_input.text],
|
59 |
text_input.categories
|
60 |
)
|
61 |
-
result = results[0] # Get first result since we're classifying one text
|
62 |
|
63 |
return ClassificationResponse(
|
64 |
category=result["category"],
|
@@ -69,9 +71,9 @@ async def classify_text(text_input: TextInput):
|
|
69 |
raise HTTPException(status_code=500, detail=str(e))
|
70 |
|
71 |
@app.post("/suggest-categories", response_model=CategorySuggestionResponse)
|
72 |
-
async def suggest_categories(texts: List[str]):
|
73 |
try:
|
74 |
-
categories = await classifier._suggest_categories_async(texts)
|
75 |
return CategorySuggestionResponse(categories=categories)
|
76 |
except Exception as e:
|
77 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
1 |
from fastapi import FastAPI, HTTPException
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from pydantic import BaseModel
|
4 |
+
from typing import List, Optional, Dict, Any, Tuple
|
5 |
import json
|
6 |
from classifiers.llm import LLMClassifier
|
7 |
from litellm import completion
|
|
|
13 |
# Load environment variables
|
14 |
load_dotenv()
|
15 |
|
16 |
+
app: FastAPI = FastAPI()
|
17 |
|
18 |
# Configure CORS
|
19 |
app.add_middleware(
|
|
|
25 |
)
|
26 |
|
27 |
# Initialize client with API key from environment
|
28 |
+
api_key: Optional[str] = os.environ.get("OPENAI_API_KEY")
|
29 |
if api_key:
|
30 |
+
success: bool
|
31 |
+
message: str
|
32 |
success, message = initialize_client(api_key)
|
33 |
if not success:
|
34 |
raise RuntimeError(f"Failed to initialize OpenAI client: {message}")
|
|
|
38 |
raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY environment variable.")
|
39 |
|
40 |
# Initialize the LLM classifier
|
41 |
+
classifier: LLMClassifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
42 |
|
43 |
class TextInput(BaseModel):
|
44 |
text: str
|
|
|
53 |
categories: List[str]
|
54 |
|
55 |
@app.post("/classify", response_model=ClassificationResponse)
|
56 |
+
async def classify_text(text_input: TextInput) -> ClassificationResponse:
|
57 |
try:
|
58 |
# Use async classification
|
59 |
+
results: List[Dict[str, Any]] = await classifier.classify_async(
|
60 |
[text_input.text],
|
61 |
text_input.categories
|
62 |
)
|
63 |
+
result: Dict[str, Any] = results[0] # Get first result since we're classifying one text
|
64 |
|
65 |
return ClassificationResponse(
|
66 |
category=result["category"],
|
|
|
71 |
raise HTTPException(status_code=500, detail=str(e))
|
72 |
|
73 |
@app.post("/suggest-categories", response_model=CategorySuggestionResponse)
|
74 |
+
async def suggest_categories(texts: List[str]) -> CategorySuggestionResponse:
|
75 |
try:
|
76 |
+
categories: List[str] = await classifier._suggest_categories_async(texts)
|
77 |
return CategorySuggestionResponse(categories=categories)
|
78 |
except Exception as e:
|
79 |
raise HTTPException(status_code=500, detail=str(e))
|
test_server.py
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
import requests
|
2 |
import json
|
|
|
3 |
|
4 |
-
BASE_URL = "http://localhost:8000"
|
5 |
|
6 |
-
def test_classify_text():
|
7 |
# Load emails from CSV file
|
8 |
import csv
|
9 |
|
10 |
-
emails = []
|
11 |
with open("examples/emails.csv", "r", encoding="utf-8") as file:
|
12 |
reader = csv.DictReader(file)
|
13 |
for row in reader:
|
@@ -15,7 +16,7 @@ def test_classify_text():
|
|
15 |
|
16 |
# Test with default categories using email content
|
17 |
for email in emails[:5]:
|
18 |
-
response = requests.post(
|
19 |
f"{BASE_URL}/classify",
|
20 |
json={"text": email["contenu"]}
|
21 |
)
|
@@ -23,11 +24,11 @@ def test_classify_text():
|
|
23 |
print(json.dumps(response.json(), indent=2))
|
24 |
|
25 |
|
26 |
-
def test_suggest_categories():
|
27 |
# Load reviews from CSV file
|
28 |
import csv
|
29 |
|
30 |
-
texts = []
|
31 |
with open("examples/reviews.csv", "r", encoding="utf-8") as file:
|
32 |
reader = csv.DictReader(file)
|
33 |
for row in reader:
|
@@ -35,7 +36,7 @@ def test_suggest_categories():
|
|
35 |
|
36 |
# Use the first few reviews for testing
|
37 |
texts = texts[:5]
|
38 |
-
response = requests.post(
|
39 |
f"{BASE_URL}/suggest-categories",
|
40 |
json=texts
|
41 |
)
|
|
|
1 |
import requests
|
2 |
import json
|
3 |
+
from typing import List, Dict, Any, Optional
|
4 |
|
5 |
+
BASE_URL: str = "http://localhost:8000"
|
6 |
|
7 |
+
def test_classify_text() -> None:
|
8 |
# Load emails from CSV file
|
9 |
import csv
|
10 |
|
11 |
+
emails: List[Dict[str, str]] = []
|
12 |
with open("examples/emails.csv", "r", encoding="utf-8") as file:
|
13 |
reader = csv.DictReader(file)
|
14 |
for row in reader:
|
|
|
16 |
|
17 |
# Test with default categories using email content
|
18 |
for email in emails[:5]:
|
19 |
+
response: requests.Response = requests.post(
|
20 |
f"{BASE_URL}/classify",
|
21 |
json={"text": email["contenu"]}
|
22 |
)
|
|
|
24 |
print(json.dumps(response.json(), indent=2))
|
25 |
|
26 |
|
27 |
+
def test_suggest_categories() -> None:
|
28 |
# Load reviews from CSV file
|
29 |
import csv
|
30 |
|
31 |
+
texts: List[str] = []
|
32 |
with open("examples/reviews.csv", "r", encoding="utf-8") as file:
|
33 |
reader = csv.DictReader(file)
|
34 |
for row in reader:
|
|
|
36 |
|
37 |
# Use the first few reviews for testing
|
38 |
texts = texts[:5]
|
39 |
+
response: requests.Response = requests.post(
|
40 |
f"{BASE_URL}/suggest-categories",
|
41 |
json=texts
|
42 |
)
|
utils.py
CHANGED
@@ -6,9 +6,12 @@ from sklearn.decomposition import PCA
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
import tempfile
|
8 |
from prompts import VALIDATION_PROMPT
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
-
def load_data(file_path):
|
12 |
"""
|
13 |
Load data from an Excel or CSV file
|
14 |
|
@@ -18,7 +21,7 @@ def load_data(file_path):
|
|
18 |
Returns:
|
19 |
pd.DataFrame: Loaded data
|
20 |
"""
|
21 |
-
file_ext = os.path.splitext(file_path)[1].lower()
|
22 |
|
23 |
if file_ext == ".xlsx" or file_ext == ".xls":
|
24 |
return pd.read_excel(file_path)
|
@@ -30,7 +33,7 @@ def load_data(file_path):
|
|
30 |
)
|
31 |
|
32 |
|
33 |
-
def export_data(df, file_name, format_type="excel"):
|
34 |
"""
|
35 |
Export dataframe to file
|
36 |
|
@@ -43,11 +46,11 @@ def export_data(df, file_name, format_type="excel"):
|
|
43 |
str: Path to the exported file
|
44 |
"""
|
45 |
# Create export directory if it doesn't exist
|
46 |
-
export_dir = "exports"
|
47 |
os.makedirs(export_dir, exist_ok=True)
|
48 |
|
49 |
# Full path for the export file
|
50 |
-
export_path = os.path.join(export_dir, file_name)
|
51 |
|
52 |
# Export based on format type
|
53 |
if format_type == "excel":
|
@@ -58,7 +61,7 @@ def export_data(df, file_name, format_type="excel"):
|
|
58 |
return export_path
|
59 |
|
60 |
|
61 |
-
def visualize_results(df, text_column, category_column="Category"):
|
62 |
"""
|
63 |
Create visualization of classification results
|
64 |
|
@@ -73,6 +76,8 @@ def visualize_results(df, text_column, category_column="Category"):
|
|
73 |
# Check if category column exists
|
74 |
if category_column not in df.columns:
|
75 |
# Create a simple figure with a message
|
|
|
|
|
76 |
fig, ax = plt.subplots(figsize=(10, 6))
|
77 |
ax.text(
|
78 |
0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
|
@@ -82,17 +87,19 @@ def visualize_results(df, text_column, category_column="Category"):
|
|
82 |
return fig
|
83 |
|
84 |
# Get categories and their counts
|
85 |
-
category_counts = df[category_column].value_counts()
|
86 |
|
87 |
# Create a new figure
|
|
|
|
|
88 |
fig, ax = plt.subplots(figsize=(10, 6))
|
89 |
|
90 |
# Create the histogram
|
91 |
-
bars = ax.bar(category_counts.index, category_counts.values)
|
92 |
|
93 |
# Add value labels on top of each bar
|
94 |
for bar in bars:
|
95 |
-
height = bar.get_height()
|
96 |
ax.text(
|
97 |
bar.get_x() + bar.get_width() / 2.0,
|
98 |
height,
|
@@ -117,7 +124,7 @@ def visualize_results(df, text_column, category_column="Category"):
|
|
117 |
return fig
|
118 |
|
119 |
|
120 |
-
def validate_results(df, text_columns, client):
|
121 |
"""
|
122 |
Use LLM to validate the classification results
|
123 |
|
@@ -131,33 +138,33 @@ def validate_results(df, text_columns, client):
|
|
131 |
"""
|
132 |
try:
|
133 |
# Sample a few rows for validation
|
134 |
-
sample_size = min(5, len(df))
|
135 |
-
sample_df = df.sample(n=sample_size, random_state=42)
|
136 |
|
137 |
# Build validation prompts
|
138 |
-
validation_prompts = []
|
139 |
for _, row in sample_df.iterrows():
|
140 |
# Combine text from all selected columns
|
141 |
-
text = " ".join(str(row[col]) for col in text_columns)
|
142 |
-
assigned_category = row["Category"]
|
143 |
-
confidence = row["Confidence"]
|
144 |
|
145 |
validation_prompts.append(
|
146 |
f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
|
147 |
)
|
148 |
|
149 |
# Use the prompt from prompts.py
|
150 |
-
prompt = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
|
151 |
|
152 |
# Call LLM API
|
153 |
-
response = client.chat.completions.create(
|
154 |
model="gpt-3.5-turbo",
|
155 |
messages=[{"role": "user", "content": prompt}],
|
156 |
temperature=0.3,
|
157 |
max_tokens=400,
|
158 |
)
|
159 |
|
160 |
-
validation_report = response.choices[0].message.content.strip()
|
161 |
return validation_report
|
162 |
|
163 |
except Exception as e:
|
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
import tempfile
|
8 |
from prompts import VALIDATION_PROMPT
|
9 |
+
from typing import List, Optional, Any, Union
|
10 |
+
from pathlib import Path
|
11 |
+
from matplotlib.figure import Figure
|
12 |
|
13 |
|
14 |
+
def load_data(file_path: Union[str, Path]) -> pd.DataFrame:
|
15 |
"""
|
16 |
Load data from an Excel or CSV file
|
17 |
|
|
|
21 |
Returns:
|
22 |
pd.DataFrame: Loaded data
|
23 |
"""
|
24 |
+
file_ext: str = os.path.splitext(file_path)[1].lower()
|
25 |
|
26 |
if file_ext == ".xlsx" or file_ext == ".xls":
|
27 |
return pd.read_excel(file_path)
|
|
|
33 |
)
|
34 |
|
35 |
|
36 |
+
def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
|
37 |
"""
|
38 |
Export dataframe to file
|
39 |
|
|
|
46 |
str: Path to the exported file
|
47 |
"""
|
48 |
# Create export directory if it doesn't exist
|
49 |
+
export_dir: str = "exports"
|
50 |
os.makedirs(export_dir, exist_ok=True)
|
51 |
|
52 |
# Full path for the export file
|
53 |
+
export_path: str = os.path.join(export_dir, file_name)
|
54 |
|
55 |
# Export based on format type
|
56 |
if format_type == "excel":
|
|
|
61 |
return export_path
|
62 |
|
63 |
|
64 |
+
def visualize_results(df: pd.DataFrame, text_column: str, category_column: str = "Category") -> Figure:
|
65 |
"""
|
66 |
Create visualization of classification results
|
67 |
|
|
|
76 |
# Check if category column exists
|
77 |
if category_column not in df.columns:
|
78 |
# Create a simple figure with a message
|
79 |
+
fig: Figure
|
80 |
+
ax: Any
|
81 |
fig, ax = plt.subplots(figsize=(10, 6))
|
82 |
ax.text(
|
83 |
0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
|
|
|
87 |
return fig
|
88 |
|
89 |
# Get categories and their counts
|
90 |
+
category_counts: pd.Series = df[category_column].value_counts()
|
91 |
|
92 |
# Create a new figure
|
93 |
+
fig: Figure
|
94 |
+
ax: Any
|
95 |
fig, ax = plt.subplots(figsize=(10, 6))
|
96 |
|
97 |
# Create the histogram
|
98 |
+
bars: Any = ax.bar(category_counts.index, category_counts.values)
|
99 |
|
100 |
# Add value labels on top of each bar
|
101 |
for bar in bars:
|
102 |
+
height: float = bar.get_height()
|
103 |
ax.text(
|
104 |
bar.get_x() + bar.get_width() / 2.0,
|
105 |
height,
|
|
|
124 |
return fig
|
125 |
|
126 |
|
127 |
+
def validate_results(df: pd.DataFrame, text_columns: List[str], client: Any) -> str:
|
128 |
"""
|
129 |
Use LLM to validate the classification results
|
130 |
|
|
|
138 |
"""
|
139 |
try:
|
140 |
# Sample a few rows for validation
|
141 |
+
sample_size: int = min(5, len(df))
|
142 |
+
sample_df: pd.DataFrame = df.sample(n=sample_size, random_state=42)
|
143 |
|
144 |
# Build validation prompts
|
145 |
+
validation_prompts: List[str] = []
|
146 |
for _, row in sample_df.iterrows():
|
147 |
# Combine text from all selected columns
|
148 |
+
text: str = " ".join(str(row[col]) for col in text_columns)
|
149 |
+
assigned_category: str = row["Category"]
|
150 |
+
confidence: float = row["Confidence"]
|
151 |
|
152 |
validation_prompts.append(
|
153 |
f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
|
154 |
)
|
155 |
|
156 |
# Use the prompt from prompts.py
|
157 |
+
prompt: str = VALIDATION_PROMPT.format("\n---\n".join(validation_prompts))
|
158 |
|
159 |
# Call LLM API
|
160 |
+
response: Any = client.chat.completions.create(
|
161 |
model="gpt-3.5-turbo",
|
162 |
messages=[{"role": "user", "content": prompt}],
|
163 |
temperature=0.3,
|
164 |
max_tokens=400,
|
165 |
)
|
166 |
|
167 |
+
validation_report: str = response.choices[0].message.content.strip()
|
168 |
return validation_report
|
169 |
|
170 |
except Exception as e:
|