simondh committed on
Commit
156898c
·
1 Parent(s): 4f9ecb6

add endpoints

Browse files
Files changed (5) hide show
  1. app.py +19 -226
  2. classifiers/llm.py +39 -35
  3. process.py +103 -1
  4. test_server.py +2 -1
  5. utils.py +47 -1
app.py CHANGED
@@ -9,14 +9,15 @@ import matplotlib.pyplot as plt
9
 
10
  import logging
11
  from dotenv import load_dotenv
12
- from process import update_api_key, process_file_async, export_results
13
  from client import get_client, initialize_client
 
 
14
 
15
  # Load environment variables from .env file
16
  load_dotenv()
17
 
18
  # Import local modules
19
- from utils import load_data, visualize_results
20
  from prompts import (
21
  CATEGORY_SUGGESTION_PROMPT,
22
  ADDITIONAL_CATEGORY_PROMPT,
@@ -147,7 +148,7 @@ with gr.Blocks(title="Text Classification System") as demo:
147
  )
148
 
149
  # Function to load file and suggest categories
150
- def load_file_and_suggest_categories(file):
151
  if not file:
152
  return (
153
  [],
@@ -167,67 +168,17 @@ with gr.Blocks(title="Text Classification System") as demo:
167
  columns = list(df.columns)
168
 
169
  # Analyze columns to suggest text columns
170
- suggested_text_columns = []
171
- for col in columns:
172
- # Check if column contains text data
173
- if df[col].dtype == "object": # String type
174
- # Check if column contains mostly text (not just numbers or dates)
175
- sample = df[col].head(100).dropna()
176
- if len(sample) > 0:
177
- # Check if most values contain spaces (indicating text)
178
- text_ratio = sum(" " in str(val) for val in sample) / len(
179
- sample
180
- )
181
- if (
182
- text_ratio > 0.3
183
- ): # If more than 30% of values contain spaces
184
- suggested_text_columns.append(col)
185
-
186
- # If no columns were suggested, use all object columns
187
- if not suggested_text_columns:
188
- suggested_text_columns = [
189
- col for col in columns if df[col].dtype == "object"
190
- ]
191
-
192
- # Get a sample of text for category suggestion
193
- sample_texts = []
194
- for col in suggested_text_columns:
195
- sample_texts.extend(df[col].head(5).tolist())
196
 
197
  # Use LLM to suggest categories
198
  if client:
199
- prompt = CATEGORY_SUGGESTION_PROMPT.format(
200
- "\n---\n".join(sample_texts[:5])
201
- )
202
- try:
203
- response = client.chat.completions.create(
204
- model="gpt-3.5-turbo",
205
- messages=[{"role": "user", "content": prompt}],
206
- temperature=0,
207
- max_tokens=100,
208
- )
209
- suggested_cats = [
210
- cat.strip()
211
- for cat in response.choices[0]
212
- .message.content.strip()
213
- .split(",")
214
- ]
215
- except:
216
- suggested_cats = [
217
- "Positive",
218
- "Negative",
219
- "Neutral",
220
- "Mixed",
221
- "Other",
222
- ]
223
  else:
224
- suggested_cats = [
225
- "Positive",
226
- "Negative",
227
- "Neutral",
228
- "Mixed",
229
- "Other",
230
- ]
231
 
232
  return (
233
  columns,
@@ -295,7 +246,7 @@ with gr.Blocks(title="Text Classification System") as demo:
295
  )
296
 
297
  # Function to suggest a new category
298
- def suggest_new_category(file, current_categories, text_columns):
299
  if not file or not text_columns:
300
  return gr.CheckboxGroup(
301
  choices=current_categories, value=current_categories
@@ -303,29 +254,16 @@ with gr.Blocks(title="Text Classification System") as demo:
303
 
304
  try:
305
  df = load_data(file.name)
306
-
307
- # Get sample texts from selected columns
308
- sample_texts = []
309
- for col in text_columns:
310
- sample_texts.extend(df[col].head(5).tolist())
311
 
312
  if client:
313
- prompt = ADDITIONAL_CATEGORY_PROMPT.format(
314
- existing_categories=", ".join(current_categories),
315
- sample_texts="\n---\n".join(sample_texts[:10]),
 
 
 
316
  )
317
- try:
318
- response = client.chat.completions.create(
319
- model="gpt-3.5-turbo",
320
- messages=[{"role": "user", "content": prompt}],
321
- temperature=0,
322
- max_tokens=50,
323
- )
324
- new_cat = response.choices[0].message.content.strip()
325
- if new_cat and new_cat not in current_categories:
326
- current_categories.append(new_cat)
327
- except:
328
- pass
329
 
330
  return gr.CheckboxGroup(
331
  choices=current_categories, value=current_categories
@@ -342,151 +280,6 @@ with gr.Blocks(title="Text Classification System") as demo:
342
  file_path = export_results(df, format_type)
343
  return gr.File(value=file_path, visible=True)
344
 
345
- # Function to improve classification based on validation report
346
- async def improve_classification_async(
347
- df,
348
- validation_report,
349
- text_columns,
350
- categories,
351
- classifier_type,
352
- show_explanations,
353
- file,
354
- ):
355
- """Async version of improve_classification"""
356
- if df is None or not validation_report:
357
- return (
358
- df,
359
- validation_report,
360
- gr.Button(visible=False),
361
- gr.CheckboxGroup(choices=[], value=[]),
362
- )
363
-
364
- try:
365
- # Extract insights from validation report
366
- if client:
367
- prompt = VALIDATION_ANALYSIS_PROMPT.format(
368
- validation_report=validation_report,
369
- current_categories=categories,
370
- )
371
- try:
372
- response = client.chat.completions.create(
373
- model="gpt-4",
374
- messages=[{"role": "user", "content": prompt}],
375
- temperature=0,
376
- max_tokens=300,
377
- )
378
- improvements = json.loads(
379
- response.choices[0].message.content.strip()
380
- )
381
-
382
- # Get current categories
383
- current_categories = [
384
- cat.strip() for cat in categories.split(",")
385
- ]
386
-
387
- # If new categories are needed, suggest them based on the data
388
- if improvements.get("new_categories_needed", False):
389
- # Get sample texts for category suggestion
390
- sample_texts = []
391
- for col in text_columns:
392
- if isinstance(file, str):
393
- temp_df = load_data(file)
394
- else:
395
- temp_df = load_data(file.name)
396
- sample_texts.extend(temp_df[col].head(10).tolist())
397
-
398
- category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
399
- current_categories=", ".join(current_categories),
400
- analysis=improvements.get("analysis", ""),
401
- sample_texts="\n---\n".join(sample_texts[:10]),
402
- )
403
-
404
- category_response = client.chat.completions.create(
405
- model="gpt-4",
406
- messages=[{"role": "user", "content": category_prompt}],
407
- temperature=0,
408
- max_tokens=100,
409
- )
410
-
411
- new_categories = [
412
- cat.strip()
413
- for cat in category_response.choices[0]
414
- .message.content.strip()
415
- .split(",")
416
- ]
417
- # Combine current and new categories
418
- all_categories = current_categories + new_categories
419
- categories = ",".join(all_categories)
420
-
421
- # Process with improved parameters
422
- improved_df, new_validation = await process_file_async(
423
- file,
424
- text_columns,
425
- categories,
426
- classifier_type,
427
- show_explanations,
428
- )
429
-
430
- return (
431
- improved_df,
432
- new_validation,
433
- gr.Button(visible=True),
434
- gr.CheckboxGroup(
435
- choices=all_categories, value=all_categories
436
- ),
437
- )
438
- except Exception as e:
439
- print(f"Error in improvement process: {str(e)}")
440
- return (
441
- df,
442
- validation_report,
443
- gr.Button(visible=True),
444
- gr.CheckboxGroup(
445
- choices=current_categories, value=current_categories
446
- ),
447
- )
448
- else:
449
- return (
450
- df,
451
- validation_report,
452
- gr.Button(visible=True),
453
- gr.CheckboxGroup(
454
- choices=current_categories, value=current_categories
455
- ),
456
- )
457
- except Exception as e:
458
- print(f"Error in improvement process: {str(e)}")
459
- return (
460
- df,
461
- validation_report,
462
- gr.Button(visible=True),
463
- gr.CheckboxGroup(
464
- choices=current_categories, value=current_categories
465
- ),
466
- )
467
-
468
- def improve_classification(
469
- df,
470
- validation_report,
471
- text_columns,
472
- categories,
473
- classifier_type,
474
- show_explanations,
475
- file,
476
- ):
477
- """Synchronous wrapper for improve_classification_async"""
478
- return asyncio.run(
479
- improve_classification_async(
480
- df,
481
- validation_report,
482
- text_columns,
483
- categories,
484
- classifier_type,
485
- show_explanations,
486
- file,
487
- )
488
- )
489
-
490
  # Connect functions
491
  load_categories_button.click(
492
  load_file_and_suggest_categories,
 
9
 
10
  import logging
11
  from dotenv import load_dotenv
12
+ from process import update_api_key, process_file_async, export_results, improve_classification
13
  from client import get_client, initialize_client
14
+ from utils import load_data, visualize_results, analyze_text_columns, get_sample_texts
15
+ from classifiers.llm import LLMClassifier
16
 
17
  # Load environment variables from .env file
18
  load_dotenv()
19
 
20
  # Import local modules
 
21
  from prompts import (
22
  CATEGORY_SUGGESTION_PROMPT,
23
  ADDITIONAL_CATEGORY_PROMPT,
 
148
  )
149
 
150
  # Function to load file and suggest categories
151
+ async def load_file_and_suggest_categories(file):
152
  if not file:
153
  return (
154
  [],
 
168
  columns = list(df.columns)
169
 
170
  # Analyze columns to suggest text columns
171
+ suggested_text_columns = analyze_text_columns(df)
172
+
173
+ # Get sample texts for category suggestion
174
+ sample_texts = get_sample_texts(df, suggested_text_columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  # Use LLM to suggest categories
177
  if client:
178
+ classifier = LLMClassifier(client=client)
179
+ suggested_cats = await classifier.suggest_categories_from_texts(sample_texts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  else:
181
+ suggested_cats = ["Positive", "Negative", "Neutral", "Mixed", "Other"]
 
 
 
 
 
 
182
 
183
  return (
184
  columns,
 
246
  )
247
 
248
  # Function to suggest a new category
249
+ async def suggest_new_category(file, current_categories, text_columns):
250
  if not file or not text_columns:
251
  return gr.CheckboxGroup(
252
  choices=current_categories, value=current_categories
 
254
 
255
  try:
256
  df = load_data(file.name)
257
+ sample_texts = get_sample_texts(df, text_columns)
 
 
 
 
258
 
259
  if client:
260
+ classifier = LLMClassifier(client=client)
261
+ new_categories = await classifier.suggest_categories_from_texts(
262
+ sample_texts, current_categories
263
+ )
264
+ return gr.CheckboxGroup(
265
+ choices=new_categories, value=new_categories
266
  )
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  return gr.CheckboxGroup(
269
  choices=current_categories, value=current_categories
 
280
  file_path = export_results(df, format_type)
281
  return gr.File(value=file_path, visible=True)
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # Connect functions
284
  load_categories_button.click(
285
  load_file_and_suggest_categories,
classifiers/llm.py CHANGED
@@ -6,14 +6,14 @@ from sklearn.metrics.pairwise import cosine_similarity
6
  import random
7
  import json
8
  import asyncio
9
- from typing import List, Dict, Any, Optional, Union
10
  import sys
11
  import os
12
  from litellm import OpenAI
13
 
14
  # Add the project root to the Python path
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
- from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
17
 
18
  from .base import BaseClassifier
19
 
@@ -26,6 +26,43 @@ class LLMClassifier(BaseClassifier):
26
  self.client: OpenAI = client
27
  self.model: str = model
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
30
  """Async version of text classification"""
31
  prompt: str = TEXT_CLASSIFICATION_PROMPT.format(
@@ -87,39 +124,6 @@ class LLMClassifier(BaseClassifier):
87
  "explanation": f"Error during classification: {str(e)}",
88
  }
89
 
90
- async def _suggest_categories_async(self, texts: List[str], sample_size: int = 20) -> List[str]:
91
- """Async version of category suggestion"""
92
- # Take a sample of texts to avoid token limitations
93
- if len(texts) > sample_size:
94
- sample_texts: List[str] = random.sample(texts, sample_size)
95
- else:
96
- sample_texts: List[str] = texts
97
-
98
- prompt: str = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
99
-
100
- try:
101
- # Use the synchronous client method but run it in a thread pool
102
- loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
103
- response: Any = await loop.run_in_executor(
104
- None,
105
- lambda: self.client.chat.completions.create(
106
- model=self.model,
107
- messages=[{"role": "user", "content": prompt}],
108
- temperature=0.2,
109
- max_tokens=100,
110
- )
111
- )
112
-
113
- # Parse response to get categories
114
- categories_text: str = response.choices[0].message.content.strip()
115
- categories: List[str] = [cat.strip() for cat in categories_text.split(",")]
116
-
117
- return categories
118
- except Exception as e:
119
- # Fallback to default categories on error
120
- print(f"Error suggesting categories: {str(e)}")
121
- return self._generate_default_categories(texts)
122
-
123
  async def classify_async(
124
  self, texts: List[str], categories: Optional[List[str]] = None
125
  ) -> List[Dict[str, Any]]:
 
6
  import random
7
  import json
8
  import asyncio
9
+ from typing import List, Dict, Any, Optional, Union, Tuple
10
  import sys
11
  import os
12
  from litellm import OpenAI
13
 
14
  # Add the project root to the Python path
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+ from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT, ADDITIONAL_CATEGORY_PROMPT
17
 
18
  from .base import BaseClassifier
19
 
 
26
  self.client: OpenAI = client
27
  self.model: str = model
28
 
29
+ async def _suggest_categories_async(self, texts: List[str], sample_size: int = 20) -> List[str]:
30
+ """Async version of category suggestion"""
31
+ # Take a sample of texts to avoid token limitations
32
+ if len(texts) > sample_size:
33
+ sample_texts: List[str] = random.sample(texts, sample_size)
34
+ else:
35
+ sample_texts: List[str] = texts
36
+
37
+ prompt: str = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
38
+
39
+ try:
40
+ # Use the synchronous client method but run it in a thread pool
41
+ loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
42
+ response: Any = await loop.run_in_executor(
43
+ None,
44
+ lambda: self.client.chat.completions.create(
45
+ model=self.model,
46
+ messages=[{"role": "user", "content": prompt}],
47
+ temperature=0.2,
48
+ max_tokens=100,
49
+ )
50
+ )
51
+
52
+ # Parse response to get categories
53
+ categories_text: str = response.choices[0].message.content.strip()
54
+ categories: List[str] = [cat.strip() for cat in categories_text.split(",")]
55
+
56
+ return categories
57
+ except Exception as e:
58
+ # Fallback to default categories on error
59
+ print(f"Error suggesting categories: {str(e)}")
60
+ return self._generate_default_categories(texts)
61
+
62
+ def _generate_default_categories(self, texts: List[str]) -> List[str]:
63
+ """Generate default categories if LLM suggestion fails"""
64
+ return ["Positive", "Negative", "Neutral", "Mixed", "Other"]
65
+
66
  async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
67
  """Async version of text classification"""
68
  prompt: str = TEXT_CLASSIFICATION_PROMPT.format(
 
124
  "explanation": f"Error during classification: {str(e)}",
125
  }
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  async def classify_async(
128
  self, texts: List[str], categories: Optional[List[str]] = None
129
  ) -> List[Dict[str, Any]]:
process.py CHANGED
@@ -6,10 +6,12 @@ from sklearn.feature_extraction.text import TfidfVectorizer
6
  from typing import Optional, List, Dict, Any, Tuple, Union
7
  import pandas as pd
8
  from pathlib import Path
 
9
 
10
  from classifiers import TFIDFClassifier, LLMClassifier
11
- from utils import load_data, validate_results
12
  from client import get_client
 
13
 
14
 
15
  def update_api_key(api_key: str) -> Tuple[bool, str]:
@@ -174,3 +176,103 @@ def export_results(df: pd.DataFrame, format_type: str) -> Optional[str]:
174
  df.to_csv(file_path, index=False)
175
 
176
  return file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from typing import Optional, List, Dict, Any, Tuple, Union
7
  import pandas as pd
8
  from pathlib import Path
9
+ import json
10
 
11
  from classifiers import TFIDFClassifier, LLMClassifier
12
+ from utils import load_data, validate_results, get_sample_texts
13
  from client import get_client
14
+ from prompts import VALIDATION_ANALYSIS_PROMPT, CATEGORY_IMPROVEMENT_PROMPT
15
 
16
 
17
  def update_api_key(api_key: str) -> Tuple[bool, str]:
 
176
  df.to_csv(file_path, index=False)
177
 
178
  return file_path
179
+
180
+
181
async def improve_classification(
    df: pd.DataFrame,
    validation_report: str,
    text_columns: List[str],
    categories: str,
    classifier_type: str,
    show_explanations: bool,
    file: Union[str, Path]
) -> Tuple[Optional[pd.DataFrame], Optional[str], bool, List[str]]:
    """
    Improve classification based on validation report

    Args:
        df (pd.DataFrame): Current classification results
        validation_report (str): Validation report from previous classification
        text_columns (List[str]): List of text column names
        categories (str): Comma-separated list of categories
        classifier_type (str): Type of classifier to use
        show_explanations (bool): Whether to show explanations
        file (Union[str, Path]): Path to the input file

    Returns:
        Tuple[Optional[pd.DataFrame], Optional[str], bool, List[str]]:
            - Improved dataframe (the original df on failure)
            - New validation report
            - Whether improvement was successful
            - Updated categories
    """
    # NOTE(review): asyncio is required for run_in_executor below but does
    # not appear among this module's visible top-level imports — imported
    # locally here so the function is self-sufficient. Confirm against the
    # full module header.
    import asyncio

    if df is None or not validation_report:
        return None, validation_report, False, []

    # Parse the category list BEFORE the try-block. Previously this was
    # assigned inside the try, so an exception raised by the first API
    # call left `current_categories` unbound and the except handler then
    # crashed with NameError, masking the real error.
    current_categories: List[str] = [cat.strip() for cat in categories.split(",")]
    updated_categories: List[str] = current_categories

    try:
        client = get_client()
        if not client:
            return None, "Error: API client not initialized", False, []

        # get_running_loop() is the recommended accessor inside a coroutine
        # (get_event_loop() is deprecated in this context).
        loop = asyncio.get_running_loop()

        # Extract insights from validation report via the LLM.
        prompt = VALIDATION_ANALYSIS_PROMPT.format(
            validation_report=validation_report,
            current_categories=categories,
        )
        response = await loop.run_in_executor(
            None,
            lambda: client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=300,
            ),
        )
        # The analysis prompt is expected to yield JSON with at least
        # "new_categories_needed" and "analysis" keys.
        improvements = json.loads(response.choices[0].message.content.strip())

        # If new categories are needed, suggest them based on the data.
        if improvements.get("new_categories_needed", False):
            # Get sample texts for category suggestion
            sample_texts = get_sample_texts(df, text_columns, sample_size=10)

            category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
                current_categories=", ".join(current_categories),
                analysis=improvements.get("analysis", ""),
                sample_texts="\n---\n".join(sample_texts)
            )
            category_response = await loop.run_in_executor(
                None,
                lambda: client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": category_prompt}],
                    temperature=0,
                    max_tokens=100,
                ),
            )
            new_categories = [
                cat.strip()
                for cat in category_response.choices[0].message.content.strip().split(",")
            ]
            # Combine current and new categories.
            updated_categories = current_categories + new_categories
            categories = ",".join(updated_categories)

        # Process with improved parameters.
        improved_df, new_validation = await process_file_async(
            file,
            text_columns,
            categories,
            classifier_type,
            show_explanations,
        )

        # `updated_categories` already reflects whether new categories were
        # added, so no need to re-evaluate the condition here.
        return improved_df, new_validation, True, updated_categories

    except Exception as e:
        print(f"Error in improvement process: {str(e)}")
        return df, validation_report, False, current_categories
test_server.py CHANGED
@@ -107,7 +107,8 @@ def test_validate_classifications() -> None:
107
  f"{BASE_URL}/suggest-categories",
108
  json=[email["contenu"] for email in emails[:5]]
109
  )
110
- current_categories: List[str] = categories_response.json()["categories"]
 
111
 
112
  # Send validation request
113
  validation_request: Dict[str, Any] = {
 
107
  f"{BASE_URL}/suggest-categories",
108
  json=[email["contenu"] for email in emails[:5]]
109
  )
110
+ response_data: Dict[str, Any] = categories_response.json()
111
+ current_categories: List[str] = response_data["categories"] # Extract categories from the response
112
 
113
  # Send validation request
114
  validation_request: Dict[str, Any] = {
utils.py CHANGED
@@ -6,7 +6,7 @@ from sklearn.decomposition import PCA
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import tempfile
8
  from prompts import VALIDATION_PROMPT
9
- from typing import List, Optional, Any, Union
10
  from pathlib import Path
11
  from matplotlib.figure import Figure
12
 
@@ -33,6 +33,52 @@ def load_data(file_path: Union[str, Path]) -> pd.DataFrame:
33
  )
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
37
  """
38
  Export dataframe to file
 
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import tempfile
8
  from prompts import VALIDATION_PROMPT
9
+ from typing import List, Optional, Any, Union, Tuple
10
  from pathlib import Path
11
  from matplotlib.figure import Figure
12
 
 
33
  )
34
 
35
 
36
def analyze_text_columns(df: pd.DataFrame) -> List[str]:
    """
    Analyze columns to suggest text columns based on content analysis

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        List[str]: List of suggested text columns
    """
    # Only object-dtype (string) columns are candidates for free text.
    candidates: List[str] = [c for c in df.columns if df[c].dtype == "object"]

    suggestions: List[str] = []
    for column in candidates:
        # Inspect at most the first 100 non-null values of the column.
        values = df[column].head(100).dropna()
        if len(values) == 0:
            continue
        # Values containing spaces are treated as textual; when more than
        # 30% of the inspected values contain a space, flag the column.
        with_spaces = sum(1 for v in values if " " in str(v))
        if with_spaces / len(values) > 0.3:
            suggestions.append(column)

    # Fall back to every object column when nothing met the threshold.
    return suggestions if suggestions else candidates
62
+
63
+
64
def get_sample_texts(df: pd.DataFrame, text_columns: List[str], sample_size: int = 5) -> List[str]:
    """
    Get sample texts from specified columns

    Args:
        df (pd.DataFrame): Input dataframe
        text_columns (List[str]): List of text column names
        sample_size (int): Number of samples to take from each column

    Returns:
        List[str]: List of sample texts
    """
    # Flatten the first `sample_size` entries of each requested column
    # into a single list, preserving the order of text_columns.
    collected: List[str] = []
    for column in text_columns:
        collected += df[column].head(sample_size).tolist()
    return collected
80
+
81
+
82
  def export_data(df: pd.DataFrame, file_name: str, format_type: str = "excel") -> str:
83
  """
84
  Export dataframe to file