Spaces:

simondh
/

classifieur

Sleeping

App Files Files Community

simondh committed on Apr 15

Commit

522275f

1 Parent(s): a2b53c6

add server

Browse files

Files changed (4) hide show

classifiers.py +0 -267
requirements.txt +95 -9
server.py +64 -0
test_server.py +43 -0

classifiers.py DELETED Viewed

@@ -1,267 +0,0 @@
-import numpy as np
-import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import cosine_similarity
-import random
-import json
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Dict, Any, Optional
-from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
-class BaseClassifier:
-    """Common interface shared by the text classifiers in this module"""
-    def __init__(self):
-        # The base class keeps no state of its own.
-        return None
-    def classify(self, texts, categories=None):
-        """
-        Assign each text to a category; concrete classifiers override this
-        Args:
-            texts (list): Text strings to classify
-            categories (list, optional): Category names to choose from. When None, the classifier picks its own
-        Returns:
-            list: One result per text holding the category, a confidence score, and an explanation
-        Raises:
-            NotImplementedError: Always, on the base class
-        """
-        raise NotImplementedError("Subclasses must implement this method")
-    def _generate_default_categories(self, texts, num_clusters=5):
-        """
-        Build placeholder category names "Category 1" .. "Category N"
-        Args:
-            texts (list): Text strings (not inspected by this placeholder implementation)
-            num_clusters (int): How many names to produce
-        Returns:
-            list: The generated category names
-        """
-        names = []
-        for position in range(1, num_clusters + 1):
-            names.append("Category " + str(position))
-        return names
-class TFIDFClassifier(BaseClassifier):
-    """Classifier using TF-IDF and clustering for fast classification
-    Clusters are mapped onto category names purely by cluster index, so the
-    label a cluster receives is positional rather than semantic.
-    """
-    def __init__(self):
-        """Create the TF-IDF vectorizer; the model is fitted on each classify() call"""
-        super().__init__()
-        self.vectorizer = TfidfVectorizer(
-            max_features=1000, stop_words="english", ngram_range=(1, 2)
-        )
-        self.model = None
-        self.feature_names = None
-        self.categories = None
-        self.centroids = None
-    def classify(self, texts, categories=None):
-        """
-        Classify texts using TF-IDF and clustering
-        Args:
-            texts (list): List of text strings to classify
-            categories (list, optional): Category names; cluster i is labelled categories[i].
-                When empty or None, "Category N" names are generated (at most 5)
-        Returns:
-            list: One dict per text with "category", "confidence" and "explanation" keys
-        """
-        # Nothing to vectorize: fit_transform would raise on an empty corpus
-        if len(texts) == 0:
-            return []
-        # Vectorize the texts
-        X = self.vectorizer.fit_transform(texts)
-        self.feature_names = self.vectorizer.get_feature_names_out()
-        # Auto-detect categories if not provided
-        if not categories:
-            num_clusters = min(5, len(texts))  # Don't create more clusters than texts
-            self.categories = self._generate_default_categories(texts, num_clusters)
-        else:
-            self.categories = categories
-            # KMeans raises when n_clusters exceeds the number of samples
-            num_clusters = min(len(categories), len(texts))
-        # Cluster the texts
-        self.model = KMeans(n_clusters=num_clusters, random_state=42)
-        clusters = self.model.fit_predict(X)
-        self.centroids = self.model.cluster_centers_
-        # Calculate distances to centroids for confidence
-        distances = self._calculate_distances(X)
-        # Prepare results, one per input text in the original order
-        return [
-            {
-                "category": self.categories[cluster_idx],
-                "confidence": self._calculate_confidence(distances[i]),
-                "explanation": self._generate_explanation(X[i], cluster_idx),
-            }
-            for i, cluster_idx in enumerate(clusters)
-        ]
-    def _calculate_distances(self, X):
-        """Calculate Euclidean distances from each point to each centroid
-        Returns an array of shape (n_texts, n_clusters).
-        """
-        # KMeans.transform yields the same point-to-centroid distances without
-        # materialising a dense (n_texts, n_clusters, n_features) array
-        return self.model.transform(X)
-    def _calculate_confidence(self, distances):
-        """Convert one text's centroid distances to a confidence score (50-100)
-        Args:
-            distances: 1-D array of distances from the text to every centroid
-        Returns:
-            Confidence rounded to one decimal; 70 when all distances are equal
-        """
-        min_dist = np.min(distances)
-        max_dist = np.max(distances)
-        if max_dist == min_dist:
-            return 70  # Default mid-range confidence when all distances are equal
-        # Compare the nearest centroid with the farthest one. The earlier
-        # min-max normalisation always had 0 as its minimum, so every text
-        # scored exactly 100. max_dist > 0 here because distances are
-        # non-negative and max_dist != min_dist.
-        relative_dist = min_dist / max_dist
-        # Invert and scale to 50-100 range (smaller distance = higher confidence)
-        confidence = 100 - (relative_dist * 50)
-        return round(float(confidence), 1)
-    def _generate_explanation(self, text_vector, cluster_idx):
-        """Generate an explanation listing the text's highest-weighted TF-IDF terms
-        Args:
-            text_vector: Single-row sparse TF-IDF vector for the text
-            cluster_idx: Assigned cluster (kept for interface compatibility; not used)
-        Returns:
-            str: Human-readable explanation
-        """
-        # Get indices of the (up to) five strongest features for this text
-        text_array = text_vector.toarray()[0]
-        top_indices = text_array.argsort()[-5:][::-1]
-        # Keep only terms that actually occur in the text
-        top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
-        if not top_features:
-            return "No significant features identified for this classification."
-        return f"Classification based on key terms: {', '.join(top_features)}"
-class LLMClassifier(BaseClassifier):
-    """Classifier using a Large Language Model for more accurate but slower classification"""
-    def __init__(self, client, model="gpt-3.5-turbo"):
-        """Store the chat client and the model name sent with every request
-        Args:
-            client: Object exposing chat.completions.create(...)
-            model (str): Model identifier passed to the client
-        """
-        super().__init__()
-        self.client = client
-        self.model = model
-    def classify(
-        self, texts: List[str], categories: Optional[List[str]] = None
-    ) -> List[Dict[str, Any]]:
-        """Classify texts using an LLM with parallel processing
-        Args:
-            texts: Texts to classify
-            categories: Allowed category names; suggested by the LLM when empty or None
-        Returns:
-            One dict per text, in input order, with "category", "confidence"
-            and "explanation". A text whose request fails gets the first
-            category with confidence 50 and the error in the explanation.
-        """
-        if not texts:
-            # Nothing to classify; don't spend an LLM call suggesting categories
-            return []
-        if not categories:
-            # First, use LLM to generate appropriate categories
-            categories = self._suggest_categories(texts)
-        # Pre-size so results can be stored by original index as futures finish
-        results: List[Any] = [None] * len(texts)
-        # Process texts in parallel
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            # Submit all tasks with their original indices
-            future_to_index = {
-                executor.submit(self._classify_text, text, categories): idx
-                for idx, text in enumerate(texts)
-            }
-            # Collect results as they complete
-            for future in as_completed(future_to_index):
-                original_idx = future_to_index[future]
-                try:
-                    results[original_idx] = future.result()
-                except Exception as e:
-                    # Best effort: one failed request must not sink the batch
-                    print(f"Error processing text: {str(e)}")
-                    results[original_idx] = {
-                        "category": categories[0],
-                        "confidence": 50,
-                        "explanation": f"Error during classification: {str(e)}",
-                    }
-        return results
-    def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
-        """Use LLM to suggest appropriate categories for the dataset
-        Args:
-            texts: Full dataset; at most sample_size texts are sent to the LLM
-            sample_size: Maximum number of texts included in the prompt
-        Returns:
-            A non-empty list of category names; generated defaults on any failure
-        """
-        # Take a sample of texts to avoid token limitations
-        if len(texts) > sample_size:
-            sample_texts = random.sample(texts, sample_size)
-        else:
-            sample_texts = texts
-        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
-        try:
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.2,
-                max_tokens=100,
-            )
-            # Parse the comma-separated reply, dropping blank entries
-            categories_text = response.choices[0].message.content.strip()
-            categories = [
-                cat.strip() for cat in categories_text.split(",") if cat.strip()
-            ]
-            if not categories:
-                # Callers index categories[0]; an empty list must not escape
-                raise ValueError("LLM returned no category names")
-            return categories
-        except Exception as e:
-            # Fallback to default categories on error
-            print(f"Error suggesting categories: {str(e)}")
-            return self._generate_default_categories(texts)
-    def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
-        """Use LLM to classify a single text
-        Args:
-            text: Text to classify
-            categories: Non-empty list of allowed category names
-        Returns:
-            Dict with "category" (always one of categories), "confidence"
-            (float in 0-100, 50 when invalid) and "explanation"
-        Raises:
-            ValueError: If the reply is valid JSON but not an object with the required keys
-            Exception: Whatever the client raises; classify() handles it per text
-        """
-        prompt = TEXT_CLASSIFICATION_PROMPT.format(
-            categories=", ".join(categories), text=text
-        )
-        response_text = ""
-        try:
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0,
-                max_tokens=200,
-            )
-            # Parse JSON response
-            response_text = response.choices[0].message.content.strip()
-            result = json.loads(response_text)
-            # Ensure the reply is an object with all required fields
-            required = ("category", "confidence", "explanation")
-            if not isinstance(result, dict) or not all(k in result for k in required):
-                raise ValueError("Missing required fields in LLM response")
-            # Validate category is in the list; default to the first one if not
-            if result["category"] not in categories:
-                result["category"] = categories[0]
-            # Validate confidence is a number between 0 and 100
-            try:
-                result["confidence"] = float(result["confidence"])
-                if not 0 <= result["confidence"] <= 100:
-                    result["confidence"] = 50
-            except (TypeError, ValueError):
-                # Non-numeric confidence (None, "high", nested object, ...)
-                result["confidence"] = 50
-            return result
-        except json.JSONDecodeError:
-            # Fall back to simple parsing if JSON fails: first category
-            # mentioned anywhere in the reply, else the first category
-            lowered = response_text.lower()
-            category = next(
-                (cat for cat in categories if cat.lower() in lowered), categories[0]
-            )
-            return {
-                "category": category,
-                "confidence": 50,
-                "explanation": "Classification based on language model analysis. (Note: Structured response parsing failed)",
-            }

requirements.txt CHANGED Viewed

@@ -1,9 +1,95 @@
-gradio>=4.0.0
-litellm>=1.10.0
-pandas>=2.0.0
-numpy>=1.24.0
-scikit-learn>=1.2.0
-openpyxl>=3.1.0
-torch>=2.0.0
-transformers>=4.30.0
-matplotlib>=3.7.0

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.16
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+attrs==25.3.0
+audioop-lts==0.2.1
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+contourpy==1.3.2
+cycler==0.12.1
+distro==1.9.0
+et-xmlfile==2.0.0
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+fonttools==4.57.0
+frozenlist==1.5.0
+fsspec==2025.3.2
+gradio==5.25.1
+gradio-client==1.8.0
+groovy==0.1.2
+h11==0.14.0
+httpcore==1.0.8
+httpx==0.28.1
+huggingface-hub==0.30.2
+idna==3.10
+importlib-metadata==8.6.1
+jinja2==3.1.6
+jiter==0.9.0
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kiwisolver==1.4.8
+litellm==1.66.1
+markdown-it-py==3.0.0
+markupsafe==3.0.2
+matplotlib==3.10.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.4.3
+networkx==3.4.2
+numpy==2.2.4
+openai==1.74.0
+openpyxl==3.1.5
+orjson==3.10.16
+packaging==24.2
+pandas==2.2.3
+pillow==11.2.1
+propcache==0.3.1
+pydantic==2.11.3
+pydantic-core==2.33.1
+pydub==0.25.1
+pygments==2.19.1
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+pyyaml==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==14.0.0
+rpds-py==0.24.0
+ruff==0.11.5
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+semantic-version==2.10.0
+setuptools==78.1.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+starlette==0.46.2
+sympy==1.13.1
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.6.0
+tqdm==4.67.1
+transformers==4.51.3
+typer==0.15.2
+typing-extensions==4.13.2
+typing-inspection==0.4.0
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.1
+websockets==15.0.1
+yarl==1.19.0
+zipp==3.21.0

server.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Optional
+import json
+from classifiers.llm import LLMClassifier
+from litellm import completion
+import asyncio
+app = FastAPI()
+# Configure CORS
+# A wildcard origin must not be combined with credentials: it would let any
+# site issue credentialed requests to this API. The endpoints defined in this
+# file do not read cookies or auth headers, so credentials stay disabled
+# while origins are open.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with specific origins
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize the LLM classifier
+# NOTE(review): classifiers/llm.py is not shown here; confirm its
+# LLMClassifier accepts the litellm `completion` callable as `client`.
+classifier = LLMClassifier(client=completion, model="gpt-3.5-turbo")
+class TextInput(BaseModel):
+    """Request body accepted by the /classify endpoint."""
+    # The single text to classify.
+    text: str
+    # Candidate category names, forwarded to the classifier unchanged
+    # (None when the client omits the field).
+    categories: Optional[List[str]] = None
+class ClassificationResponse(BaseModel):
+    """Response body returned by the /classify endpoint."""
+    # Category taken from the classifier result's "category" key.
+    category: str
+    # Score taken from the classifier result's "confidence" key.
+    confidence: float
+    # Text taken from the classifier result's "explanation" key.
+    explanation: str
+class CategorySuggestionResponse(BaseModel):
+    """Response body returned by the /suggest-categories endpoint."""
+    # Category names produced by the classifier for the submitted texts.
+    categories: List[str]
+@app.post("/classify", response_model=ClassificationResponse)
+async def classify_text(text_input: TextInput):
+    """Classify one text and return its category, confidence and explanation.
+    Any failure while classifying or building the response is reported as
+    HTTP 500 with the exception message as the detail.
+    """
+    try:
+        # The classifier works on batches, so wrap the single text in a list
+        batch = [text_input.text]
+        outcomes = await classifier.classify_async(batch, text_input.categories)
+        first = outcomes[0]
+        payload = {
+            field: first[field]
+            for field in ("category", "confidence", "explanation")
+        }
+        return ClassificationResponse(**payload)
+    except Exception as err:
+        raise HTTPException(status_code=500, detail=str(err))
+@app.post("/suggest-categories", response_model=CategorySuggestionResponse)
+async def suggest_categories(texts: List[str]):
+    """Ask the classifier to propose category names for the posted texts.
+    Any failure is reported as HTTP 500 with the exception message as the detail.
+    """
+    try:
+        return CategorySuggestionResponse(
+            categories=await classifier._suggest_categories_async(texts)
+        )
+    except Exception as err:
+        raise HTTPException(status_code=500, detail=str(err))
+if __name__ == "__main__":
+    # Imported here so uvicorn is only required when the module is run as a script.
+    import uvicorn
+    # Serve the app on every network interface, port 8000.
+    uvicorn.run(app, host="0.0.0.0", port=8000)

test_server.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import requests
+import json
+# Base address of the server under test; 8000 is the port server.py binds
+# when it is run as a script.
+BASE_URL = "http://localhost:8000"
+def test_classify_text():
+    """Call /classify with and without explicit categories and print the replies.
+    Requires the server to be listening at BASE_URL. Fails on a non-2xx
+    status instead of silently printing the error body.
+    """
+    # Test with default categories
+    response = requests.post(
+        f"{BASE_URL}/classify",
+        json={"text": "This is a sample text about technology and innovation."},
+        timeout=60,  # requests waits forever without an explicit timeout
+    )
+    response.raise_for_status()
+    print("Classification with default categories:")
+    print(json.dumps(response.json(), indent=2))
+    # Test with custom categories
+    response = requests.post(
+        f"{BASE_URL}/classify",
+        json={
+            "text": "This is a sample text about technology and innovation.",
+            "categories": ["Technology", "Business", "Science", "Sports"]
+        },
+        timeout=60,
+    )
+    response.raise_for_status()
+    print("\nClassification with custom categories:")
+    print(json.dumps(response.json(), indent=2))
+def test_suggest_categories():
+    """Call /suggest-categories with three sample texts and print the reply.
+    Requires the server to be listening at BASE_URL. Fails on a non-2xx
+    status instead of silently printing the error body.
+    """
+    texts = [
+        "This is a text about artificial intelligence and machine learning.",
+        "A new breakthrough in quantum computing has been announced.",
+        "The latest smartphone features innovative camera technology."
+    ]
+    response = requests.post(
+        f"{BASE_URL}/suggest-categories",
+        json=texts,
+        timeout=60,  # requests waits forever without an explicit timeout
+    )
+    response.raise_for_status()
+    print("\nSuggested categories:")
+    print(json.dumps(response.json(), indent=2))
+if __name__ == "__main__":
+    # Manual smoke run: both checks send requests to BASE_URL, so the
+    # FastAPI server must already be running there.
+    print("Testing FastAPI server endpoints...")
+    test_classify_text()
+    test_suggest_categories()