black .
- app.py +314 -159
- classifiers.py +72 -66
- prompts.py +1 -1
- utils.py +61 -48
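This commit applies the Black code formatter (`black .`) across the repository, so the hunks below are mechanical reformatting rather than behavior changes: double-quoted strings, magic trailing commas, long calls wrapped one argument per line, and normalized blank lines. A minimal before/after sketch of the pattern (the "before" line is illustrative, since the deleted lines are truncated in this view):

import logging

# Hypothetical pre-Black style: one long call, closing paren on the same line.
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# Post-Black style, as it appears in the app.py hunk below: arguments wrapped,
# closing paren moved to its own line.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)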
app.py
CHANGED
@@ -20,12 +20,13 @@ from prompts import (
     CATEGORY_SUGGESTION_PROMPT,
     ADDITIONAL_CATEGORY_PROMPT,
     VALIDATION_ANALYSIS_PROMPT,
-    CATEGORY_IMPROVEMENT_PROMPT
+    CATEGORY_IMPROVEMENT_PROMPT,
 )
 
 # Configure logging
-logging.basicConfig( …
-…
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
 
 # Initialize API key from environment variable
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
@@ -39,22 +40,23 @@ if OPENAI_API_KEY:
     except Exception as e:
         logging.error(f"Failed to initialize OpenAI client: {str(e)}")
 
+
 def update_api_key(api_key):
     """Update the OpenAI API key"""
     global OPENAI_API_KEY, client
-
+
     if not api_key:
         return "API Key cannot be empty"
-
+
     OPENAI_API_KEY = api_key
-
+
     try:
         client = OpenAI(api_key=api_key)
         # Test the connection with a simple request
         response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": "test"}],
-            max_tokens=5
+            max_tokens=5,
         )
         return f"API Key updated and verified successfully"
     except Exception as e:
@@ -62,41 +64,45 @@ def update_api_key(api_key):
         logging.error(f"API key update failed: {error_msg}")
         return f"Failed to update API Key: {error_msg}"
 
+
 def process_file(file, text_columns, categories, classifier_type, show_explanations):
     """Process the uploaded file and classify text data"""
     # Initialize result_df and validation_report
     result_df = None
     validation_report = None
-
+
     try:
         # Load data from file
         if isinstance(file, str):
             df = load_data(file)
         else:
            df = load_data(file.name)
-
+
         if not text_columns:
             return None, "Please select at least one text column"
-
+
         # Check if all selected columns exist
        missing_columns = [col for col in text_columns if col not in df.columns]
         if missing_columns:
-            return …
-            …
+            return (
+                None,
+                f"Columns not found in the file: {', '.join(missing_columns)}. Available columns: {', '.join(df.columns)}",
+            )
+
         # Combine text from selected columns
         texts = []
         for _, row in df.iterrows():
             combined_text = " ".join(str(row[col]) for col in text_columns)
             texts.append(combined_text)
-
+
         # Parse categories if provided
         category_list = []
         if categories:
             category_list = [cat.strip() for cat in categories.split(",")]
-
+
         # Select classifier based on data size and user choice
         num_texts = len(texts)
-
+
         # If no specific model is chosen, select the most appropriate one
         if classifier_type == "auto":
             if num_texts <= 500:
@@ -107,30 +113,36 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
                 classifier_type = "hybrid"
             else:
                 classifier_type = "tfidf"
-
+
         # Initialize appropriate classifier
         if classifier_type == "tfidf":
             classifier = TFIDFClassifier()
             results = classifier.classify(texts, category_list)
         elif classifier_type in ["gpt35", "gpt4"]:
             if client is None:
-                return …
+                return (
+                    None,
+                    "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
+                )
             model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
             classifier = LLMClassifier(client=client, model=model)
             results = classifier.classify(texts, category_list)
         else:  # hybrid
             if client is None:
-                return …
+                return (
+                    None,
+                    "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'.",
+                )
             # First pass with TF-IDF
             tfidf_classifier = TFIDFClassifier()
             tfidf_results = tfidf_classifier.classify(texts, category_list)
-
+
             # Second pass with LLM for low confidence results
             llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
             results = []
             low_confidence_texts = []
             low_confidence_indices = []
-
+
             for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
                 if tfidf_result["confidence"] < 70:  # If confidence is below 70%
                     low_confidence_texts.append(text)
@@ -138,91 +150,97 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
                     results.append(None)  # Placeholder
                 else:
                     results.append(tfidf_result)
-
+
             if low_confidence_texts:
-                llm_results = llm_classifier.classify( …
+                llm_results = llm_classifier.classify(
+                    low_confidence_texts, category_list
+                )
                 for idx, llm_result in zip(low_confidence_indices, llm_results):
                     results[idx] = llm_result
-
+
         # Create results dataframe
         result_df = df.copy()
         result_df["Category"] = [r["category"] for r in results]
         result_df["Confidence"] = [r["confidence"] for r in results]
-
+
         if show_explanations:
             result_df["Explanation"] = [r["explanation"] for r in results]
-
+
         # Validate results using LLM
         validation_report = validate_results(result_df, text_columns, client)
-
+
         return result_df, validation_report
-
+
     except Exception as e:
         error_traceback = traceback.format_exc()
         return None, f"Error: {str(e)}\n{error_traceback}"
 
+
 def export_results(df, format_type):
     """Export results to a file and return the file path for download"""
     if df is None:
         return None
-
+
     # Create a temporary file
     import tempfile
     import os
-
+
     # Create a temporary directory if it doesn't exist
     temp_dir = "temp_exports"
     os.makedirs(temp_dir, exist_ok=True)
-
+
     # Generate a unique filename
     timestamp = time.strftime("%Y%m%d-%H%M%S")
     filename = f"classification_results_{timestamp}"
-
+
     if format_type == "excel":
         file_path = os.path.join(temp_dir, f"{filename}.xlsx")
         df.to_excel(file_path, index=False)
     else:
         file_path = os.path.join(temp_dir, f"{filename}.csv")
         df.to_csv(file_path, index=False)
-
+
     return file_path
 
+
 # Create Gradio interface
 with gr.Blocks(title="Text Classification System") as demo:
     gr.Markdown("# Text Classification System")
     gr.Markdown("Upload your data file (Excel/CSV) and classify text using AI")
-
+
     with gr.Tab("Setup"):
         api_key_input = gr.Textbox(
             label="OpenAI API Key",
             placeholder="Enter your API key here",
             type="password",
-            value=OPENAI_API_KEY
+            value=OPENAI_API_KEY,
         )
         api_key_button = gr.Button("Update API Key")
         api_key_message = gr.Textbox(label="Status", interactive=False)
-
+
         # Display current API status
-        api_status = …
+        api_status = (
+            "API Key is set" if OPENAI_API_KEY else "No API Key found. Please set one."
+        )
         gr.Markdown(f"**Current API Status**: {api_status}")
-
-        api_key_button.click( …
-        …
+
+        api_key_button.click(
+            update_api_key, inputs=[api_key_input], outputs=[api_key_message]
+        )
+
     with gr.Tab("Classify Data"):
         with gr.Column():
             file_input = gr.File(label="Upload Excel/CSV File")
-
+
             # Variable to store available columns
            available_columns = gr.State([])
-
+
             # Button to load file and suggest categories
            load_categories_button = gr.Button("Load File")
-
+
             # Display original dataframe
             original_df = gr.Dataframe(
-                label="Original Data",
-                interactive=False,
-                visible=False
+                label="Original Data", interactive=False, visible=False
             )
 
             with gr.Row():
@@ -232,31 +250,29 @@ with gr.Blocks(title="Text Classification System") as demo:
                     choices=[],
                     value=[],
                     interactive=True,
-                    visible=False
+                    visible=False,
                 )
 
                 new_category = gr.Textbox(
                     label="Add New Category",
                     placeholder="Enter a new category name",
-                    visible=False
+                    visible=False,
                 )
                 with gr.Row():
                     add_category_button = gr.Button("Add Category", visible=False)
-                    suggest_category_button = gr.Button( …
-                    …
+                    suggest_category_button = gr.Button(
+                        "Suggest Category", visible=False
+                    )
 
                 # Original categories input (hidden)
-                categories = gr.Textbox(
-                    …
-                )
-
-
+                categories = gr.Textbox(visible=False)
+
             with gr.Column():
                 text_column = gr.CheckboxGroup(
-                    label="Select Text Columns",
-                    choices=[],
+                    label="Select Text Columns",
+                    choices=[],
                     interactive=True,
-                    visible=False
+                    visible=False,
                 )
 
                 classifier_type = gr.Dropdown(
@@ -264,18 +280,20 @@ with gr.Blocks(title="Text Classification System") as demo:
                         ("TF-IDF (Rapide, <1000 lignes)", "tfidf"),
                         ("LLM GPT-3.5 (Fiable, <1000 lignes)", "gpt35"),
                         ("LLM GPT-4 (Très fiable, <500 lignes)", "gpt4"),
-                        ("TF-IDF + LLM (Hybride, >1000 lignes)", "hybrid")
+                        ("TF-IDF + LLM (Hybride, >1000 lignes)", "hybrid"),
                     ],
                     label="Modèle de classification",
                     value="gpt35",
-                    visible=False …
+                    visible=False,
+                )
+                show_explanations = gr.Checkbox(
+                    label="Show Explanations", value=True, visible=False
                 )
-
-
+
                 process_button = gr.Button("Process and Classify", visible=False)
 
                 results_df = gr.Dataframe(interactive=True, visible=False)
-
+
                 # Create containers for visualization and validation report
                 with gr.Row(visible=False) as results_row:
                     with gr.Column():
@@ -284,161 +302,251 @@ with gr.Blocks(title="Text Classification System") as demo:
                        csv_download = gr.File(label="Download CSV", visible=False)
                         excel_download = gr.File(label="Download Excel", visible=False)
                     with gr.Column():
-                        validation_output = gr.Textbox( …
-                        …
+                        validation_output = gr.Textbox(
+                            label="Validation Report", interactive=False
+                        )
+                        improve_button = gr.Button(
+                            "Improve Classification with Report", visible=False
+                        )
 
     # Function to load file and suggest categories
     def load_file_and_suggest_categories(file):
         if not file:
-            return …
+            return (
+                [],
+                gr.CheckboxGroup(choices=[]),
+                gr.CheckboxGroup(choices=[], visible=False),
+                gr.Textbox(visible=False),
+                gr.Button(visible=False),
+                gr.Button(visible=False),
+                gr.CheckboxGroup(choices=[], visible=False),
+                gr.Dropdown(visible=False),
+                gr.Checkbox(visible=False),
+                gr.Button(visible=False),
+                gr.Dataframe(visible=False),
+            )
         try:
             df = load_data(file.name)
             columns = list(df.columns)
-
+
             # Analyze columns to suggest text columns
             suggested_text_columns = []
             for col in columns:
                 # Check if column contains text data
-                if df[col].dtype == …
+                if df[col].dtype == "object":  # String type
                     # Check if column contains mostly text (not just numbers or dates)
                     sample = df[col].head(100).dropna()
                     if len(sample) > 0:
                         # Check if most values contain spaces (indicating text)
-                        text_ratio = sum( …
-                        …
+                        text_ratio = sum(" " in str(val) for val in sample) / len(
+                            sample
+                        )
+                        if (
+                            text_ratio > 0.3
+                        ):  # If more than 30% of values contain spaces
                             suggested_text_columns.append(col)
-
+
             # If no columns were suggested, use all object columns
             if not suggested_text_columns:
-                suggested_text_columns = [ …
-                …
+                suggested_text_columns = [
+                    col for col in columns if df[col].dtype == "object"
+                ]
+
             # Get a sample of text for category suggestion
             sample_texts = []
             for col in suggested_text_columns:
                 sample_texts.extend(df[col].head(5).tolist())
-
+
             # Use LLM to suggest categories
             if client:
-                prompt = CATEGORY_SUGGESTION_PROMPT.format( …
+                prompt = CATEGORY_SUGGESTION_PROMPT.format(
+                    "\n---\n".join(sample_texts[:5])
+                )
                 try:
                     response = client.chat.completions.create(
                         model="gpt-3.5-turbo",
                         messages=[{"role": "user", "content": prompt}],
                         temperature=0,
-                        max_tokens=100
+                        max_tokens=100,
                     )
-                    suggested_cats = [ …
+                    suggested_cats = [
+                        cat.strip()
+                        for cat in response.choices[0]
+                        .message.content.strip()
+                        .split(",")
+                    ]
                 except:
-                    suggested_cats = [ …
+                    suggested_cats = [
+                        "Positive",
+                        "Negative",
+                        "Neutral",
+                        "Mixed",
+                        "Other",
+                    ]
             else:
-                suggested_cats = [
-                    …
+                suggested_cats = [
+                    "Positive",
+                    "Negative",
+                    "Neutral",
+                    "Mixed",
+                    "Other",
+                ]
+
             return (
-                columns,
-                gr.CheckboxGroup(choices=columns, value=suggested_text_columns),
-                gr.CheckboxGroup( …
+                columns,
+                gr.CheckboxGroup(choices=columns, value=suggested_text_columns),
+                gr.CheckboxGroup(
+                    choices=suggested_cats, value=suggested_cats, visible=True
+                ),
                 gr.Textbox(visible=True),
                 gr.Button(visible=True),
                 gr.Button(visible=True),
-                gr.CheckboxGroup( …
+                gr.CheckboxGroup(
+                    choices=columns, value=suggested_text_columns, visible=True
+                ),
                 gr.Dropdown(visible=True),
                 gr.Checkbox(visible=True),
                 gr.Button(visible=True),
-                gr.Dataframe(value=df, visible=True)
+                gr.Dataframe(value=df, visible=True),
             )
         except Exception as e:
-            return …
-            …
+            return (
+                [],
+                gr.CheckboxGroup(choices=[]),
+                gr.CheckboxGroup(choices=[], visible=False),
+                gr.Textbox(visible=False),
+                gr.Button(visible=False),
+                gr.Button(visible=False),
+                gr.CheckboxGroup(choices=[], visible=False),
+                gr.Dropdown(visible=False),
+                gr.Checkbox(visible=False),
+                gr.Button(visible=False),
+                gr.Dataframe(visible=False),
+            )
+
     # Function to add a new category
     def add_new_category(current_categories, new_category):
         if not new_category or new_category.strip() == "":
             return current_categories
         new_categories = current_categories + [new_category.strip()]
         return gr.CheckboxGroup(choices=new_categories, value=new_categories)
-
+
     # Function to update categories textbox
     def update_categories_textbox(selected_categories):
         return ", ".join(selected_categories)
-
+
     # Function to show results after processing
     def show_results(df, validation_report):
         """Show the results after processing"""
         if df is None:
-            return …
-
+            return (
+                gr.Row(visible=False),
+                gr.File(visible=False),
+                gr.File(visible=False),
+                gr.Dataframe(visible=False),
+            )
+
         # Export to both formats
         csv_path = export_results(df, "csv")
         excel_path = export_results(df, "excel")
-
-        return …
-
+
+        return (
+            gr.Row(visible=True),
+            gr.File(value=csv_path, visible=True),
+            gr.File(value=excel_path, visible=True),
+            gr.Dataframe(value=df, visible=True),
+        )
+
     # Function to suggest a new category
     def suggest_new_category(file, current_categories, text_columns):
         if not file or not text_columns:
-            return gr.CheckboxGroup( …
-            …
+            return gr.CheckboxGroup(
+                choices=current_categories, value=current_categories
+            )
+
         try:
             df = load_data(file.name)
-
+
             # Get sample texts from selected columns
             sample_texts = []
             for col in text_columns:
                 sample_texts.extend(df[col].head(5).tolist())
-
+
             if client:
                 prompt = ADDITIONAL_CATEGORY_PROMPT.format(
                     existing_categories=", ".join(current_categories),
-                    sample_texts="\n---\n".join(sample_texts[:10])
+                    sample_texts="\n---\n".join(sample_texts[:10]),
                 )
                 try:
                     response = client.chat.completions.create(
                         model="gpt-3.5-turbo",
                         messages=[{"role": "user", "content": prompt}],
                         temperature=0,
-                        max_tokens=50
+                        max_tokens=50,
                    )
                     new_cat = response.choices[0].message.content.strip()
                     if new_cat and new_cat not in current_categories:
                         current_categories.append(new_cat)
                 except:
                     pass
-
-            return gr.CheckboxGroup( …
+
+            return gr.CheckboxGroup(
+                choices=current_categories, value=current_categories
+            )
         except Exception as e:
-            return gr.CheckboxGroup(
-                …
+            return gr.CheckboxGroup(
+                choices=current_categories, value=current_categories
+            )
+
     # Function to handle export and show download button
     def handle_export(df, format_type):
         if df is None:
             return gr.File(visible=False)
         file_path = export_results(df, format_type)
         return gr.File(value=file_path, visible=True)
-
+
     # Function to improve classification based on validation report
-    def improve_classification( …
+    def improve_classification(
+        df,
+        validation_report,
+        text_columns,
+        categories,
+        classifier_type,
+        show_explanations,
+        file,
+    ):
         """Improve classification based on validation report"""
         if df is None or not validation_report:
-            return …
-
+            return (
+                df,
+                validation_report,
+                gr.Button(visible=False),
+                gr.CheckboxGroup(choices=[], value=[]),
+            )
+
         try:
             # Extract insights from validation report
             if client:
                 prompt = VALIDATION_ANALYSIS_PROMPT.format(
                     validation_report=validation_report,
-                    current_categories=categories
+                    current_categories=categories,
                 )
                 try:
                     response = client.chat.completions.create(
                         model="gpt-4",
                         messages=[{"role": "user", "content": prompt}],
                         temperature=0,
-                        max_tokens=300 …
+                        max_tokens=300,
+                    )
+                    improvements = json.loads(
+                        response.choices[0].message.content.strip()
                     )
-
-
+
                     # Get current categories
-                    current_categories = [ …
-                    …
+                    current_categories = [
+                        cat.strip() for cat in categories.split(",")
+                    ]
+
                     # If new categories are needed, suggest them based on the data
                     if improvements.get("new_categories_needed", False):
                         # Get sample texts for category suggestion
@@ -449,51 +557,84 @@ with gr.Blocks(title="Text Classification System") as demo:
                         else:
                             temp_df = load_data(file.name)
                             sample_texts.extend(temp_df[col].head(10).tolist())
-
+
                         category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
                             current_categories=", ".join(current_categories),
-                            analysis=improvements.get( …
-                            sample_texts="\n---\n".join(sample_texts[:10])
+                            analysis=improvements.get("analysis", ""),
+                            sample_texts="\n---\n".join(sample_texts[:10]),
                         )
-
+
                         category_response = client.chat.completions.create(
                             model="gpt-4",
                             messages=[{"role": "user", "content": category_prompt}],
                             temperature=0,
-                            max_tokens=100
+                            max_tokens=100,
                         )
-
-                        new_categories = [ …
+
+                        new_categories = [
+                            cat.strip()
+                            for cat in category_response.choices[0]
+                            .message.content.strip()
+                            .split(",")
+                        ]
                         # Combine current and new categories
                         all_categories = current_categories + new_categories
                         categories = ",".join(all_categories)
-
+
                     # Process with improved parameters
                     improved_df, new_validation = process_file(
                         file,
                         text_columns,
                         categories,
                         classifier_type,
-                        show_explanations
+                        show_explanations,
+                    )
+
+                    return (
+                        improved_df,
+                        new_validation,
+                        gr.Button(visible=True),
+                        gr.CheckboxGroup(
+                            choices=all_categories, value=all_categories
+                        ),
                     )
-
-                    return improved_df, new_validation, gr.Button(visible=True), gr.CheckboxGroup(choices=all_categories, value=all_categories)
                 except Exception as e:
                     print(f"Error in improvement process: {str(e)}")
-                    return …
+                    return (
+                        df,
+                        validation_report,
+                        gr.Button(visible=True),
+                        gr.CheckboxGroup(
+                            choices=current_categories, value=current_categories
+                        ),
+                    )
             else:
-                return …
+                return (
+                    df,
+                    validation_report,
+                    gr.Button(visible=True),
+                    gr.CheckboxGroup(
+                        choices=current_categories, value=current_categories
+                    ),
+                )
         except Exception as e:
             print(f"Error in improvement process: {str(e)}")
-            return …
-
+            return (
+                df,
+                validation_report,
+                gr.Button(visible=True),
+                gr.CheckboxGroup(
+                    choices=current_categories, value=current_categories
+                ),
+            )
+
     # Connect functions
     load_categories_button.click(
         load_file_and_suggest_categories,
         inputs=[file_input],
         outputs=[
-            available_columns,
-            text_column,
+            available_columns,
+            text_column,
             suggested_categories,
             new_category,
             add_category_button,
@@ -502,74 +643,88 @@ with gr.Blocks(title="Text Classification System") as demo:
             classifier_type,
             show_explanations,
             process_button,
-            original_df
-        ]
+            original_df,
+        ],
     )
-
+
     add_category_button.click(
         add_new_category,
         inputs=[suggested_categories, new_category],
-        outputs=[suggested_categories]
+        outputs=[suggested_categories],
    )
-
+
     suggested_categories.change(
         update_categories_textbox,
         inputs=[suggested_categories],
-        outputs=[categories]
+        outputs=[categories],
     )
-
+
     suggest_category_button.click(
         suggest_new_category,
         inputs=[file_input, suggested_categories, text_column],
-        outputs=[suggested_categories]
+        outputs=[suggested_categories],
     )
-
+
     process_button.click(
-        lambda: gr.Dataframe(visible=True),
-        inputs=[],
-        outputs=[results_df]
+        lambda: gr.Dataframe(visible=True), inputs=[], outputs=[results_df]
     ).then(
         process_file,
-        inputs=[ …
-        …
+        inputs=[
+            file_input,
+            text_column,
+            categories,
+            classifier_type,
+            show_explanations,
+        ],
+        outputs=[results_df, validation_output],
    ).then(
         show_results,
         inputs=[results_df, validation_output],
-        outputs=[results_row, csv_download, excel_download, results_df]
+        outputs=[results_row, csv_download, excel_download, results_df],
    ).then(
-        visualize_results,
-        inputs=[results_df, text_column],
-        outputs=[visualization]
+        visualize_results, inputs=[results_df, text_column], outputs=[visualization]
    ).then(
-        lambda x: gr.Button(visible=True),
-        inputs=[],
-        outputs=[improve_button]
+        lambda x: gr.Button(visible=True), inputs=[], outputs=[improve_button]
     )
-
+
     improve_button.click(
         improve_classification,
-        inputs=[ …
-        …
+        inputs=[
+            results_df,
+            validation_output,
+            text_column,
+            categories,
+            classifier_type,
+            show_explanations,
+            file_input,
+        ],
+        outputs=[
+            results_df,
+            validation_output,
+            improve_button,
+            suggested_categories,
+        ],
    ).then(
         show_results,
         inputs=[results_df, validation_output],
-        outputs=[results_row, csv_download, excel_download, results_df]
+        outputs=[results_row, csv_download, excel_download, results_df],
    ).then(
-        visualize_results,
-        inputs=[results_df, text_column],
-        outputs=[visualization]
+        visualize_results, inputs=[results_df, text_column], outputs=[visualization]
     )
 
+
 def create_example_data():
     """Create example data for demonstration"""
     from utils import create_example_file
+
     example_path = create_example_file()
     return f"Example file created at: {example_path}"
 
+
 if __name__ == "__main__":
     # Create examples directory and sample file if it doesn't exist
     if not os.path.exists("examples"):
         create_example_data()
-
+
     # Launch the Gradio app
     demo.launch()
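The most substantive logic in app.py is `process_file`'s hybrid mode: TF-IDF classifies everything first, and only rows whose confidence falls below 70 are re-classified by the LLM. A condensed sketch of that routing, using the classifier interface from classifiers.py (the name `hybrid_classify` is ours, for illustration):

def hybrid_classify(texts, category_list, tfidf_classifier, llm_classifier, threshold=70):
    """First pass with TF-IDF; second pass with the LLM for low-confidence rows."""
    results = tfidf_classifier.classify(texts, category_list)
    # Collect the indices the cheap classifier was unsure about.
    low_conf = [i for i, r in enumerate(results) if r["confidence"] < threshold]
    if low_conf:
        llm_results = llm_classifier.classify(
            [texts[i] for i in low_conf], category_list
        )
        for i, r in zip(low_conf, llm_results):
            results[i] = r  # the LLM result replaces the uncertain TF-IDF result
    return results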
classifiers.py
CHANGED
@@ -9,32 +9,34 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9 |
from typing import List, Dict, Any, Optional
|
10 |
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
11 |
|
|
|
12 |
class BaseClassifier:
|
13 |
"""Base class for text classifiers"""
|
|
|
14 |
def __init__(self):
|
15 |
pass
|
16 |
-
|
17 |
def classify(self, texts, categories=None):
|
18 |
"""
|
19 |
Classify a list of texts into categories
|
20 |
-
|
21 |
Args:
|
22 |
texts (list): List of text strings to classify
|
23 |
categories (list, optional): List of category names. If None, categories will be auto-detected
|
24 |
-
|
25 |
Returns:
|
26 |
list: List of classification results with categories, confidence scores, and explanations
|
27 |
"""
|
28 |
raise NotImplementedError("Subclasses must implement this method")
|
29 |
-
|
30 |
def _generate_default_categories(self, texts, num_clusters=5):
|
31 |
"""
|
32 |
Generate default categories based on text clustering
|
33 |
-
|
34 |
Args:
|
35 |
texts (list): List of text strings
|
36 |
num_clusters (int): Number of clusters to generate
|
37 |
-
|
38 |
Returns:
|
39 |
list: List of category names
|
40 |
"""
|
@@ -45,25 +47,23 @@ class BaseClassifier:
|
|
45 |
|
46 |
class TFIDFClassifier(BaseClassifier):
|
47 |
"""Classifier using TF-IDF and clustering for fast classification"""
|
48 |
-
|
49 |
def __init__(self):
|
50 |
super().__init__()
|
51 |
self.vectorizer = TfidfVectorizer(
|
52 |
-
max_features=1000,
|
53 |
-
stop_words='english',
|
54 |
-
ngram_range=(1, 2)
|
55 |
)
|
56 |
self.model = None
|
57 |
self.feature_names = None
|
58 |
self.categories = None
|
59 |
self.centroids = None
|
60 |
-
|
61 |
def classify(self, texts, categories=None):
|
62 |
"""Classify texts using TF-IDF and clustering"""
|
63 |
# Vectorize the texts
|
64 |
X = self.vectorizer.fit_transform(texts)
|
65 |
self.feature_names = self.vectorizer.get_feature_names_out()
|
66 |
-
|
67 |
# Auto-detect categories if not provided
|
68 |
if not categories:
|
69 |
num_clusters = min(5, len(texts)) # Don't create more clusters than texts
|
@@ -71,98 +71,106 @@ class TFIDFClassifier(BaseClassifier):
|
|
71 |
else:
|
72 |
self.categories = categories
|
73 |
num_clusters = len(categories)
|
74 |
-
|
75 |
# Cluster the texts
|
76 |
self.model = KMeans(n_clusters=num_clusters, random_state=42)
|
77 |
clusters = self.model.fit_predict(X)
|
78 |
self.centroids = self.model.cluster_centers_
|
79 |
-
|
80 |
# Calculate distances to centroids for confidence
|
81 |
distances = self._calculate_distances(X)
|
82 |
-
|
83 |
# Prepare results
|
84 |
results = []
|
85 |
for i, text in enumerate(texts):
|
86 |
cluster_idx = clusters[i]
|
87 |
-
|
88 |
# Calculate confidence (inverse of distance, normalized)
|
89 |
confidence = self._calculate_confidence(distances[i])
|
90 |
-
|
91 |
# Create explanation
|
92 |
explanation = self._generate_explanation(X[i], cluster_idx)
|
93 |
-
|
94 |
-
results.append(
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
100 |
return results
|
101 |
-
|
102 |
def _calculate_distances(self, X):
|
103 |
"""Calculate distances from each point to each centroid"""
|
104 |
-
return np.sqrt(
|
105 |
-
|
|
|
|
|
|
|
|
|
106 |
def _calculate_confidence(self, distances):
|
107 |
"""Convert distances to confidence scores (0-100)"""
|
108 |
min_dist = np.min(distances)
|
109 |
max_dist = np.max(distances)
|
110 |
-
|
111 |
# Normalize and invert (smaller distance = higher confidence)
|
112 |
if max_dist == min_dist:
|
113 |
return 70 # Default mid-range confidence when all distances are equal
|
114 |
-
|
115 |
normalized_dist = (distances - min_dist) / (max_dist - min_dist)
|
116 |
min_normalized = np.min(normalized_dist)
|
117 |
-
|
118 |
# Invert and scale to 50-100 range (TF-IDF is never 100% confident)
|
119 |
confidence = 100 - (min_normalized * 50)
|
120 |
return round(confidence, 1)
|
121 |
-
|
122 |
def _generate_explanation(self, text_vector, cluster_idx):
|
123 |
"""Generate an explanation for the classification"""
|
124 |
# Get the most important features for this cluster
|
125 |
centroid = self.centroids[cluster_idx]
|
126 |
-
|
127 |
# Get indices of top features for this text
|
128 |
text_array = text_vector.toarray()[0]
|
129 |
top_indices = text_array.argsort()[-5:][::-1]
|
130 |
-
|
131 |
# Get the feature names for these indices
|
132 |
top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
|
133 |
-
|
134 |
if not top_features:
|
135 |
return "No significant features identified for this classification."
|
136 |
-
|
137 |
explanation = f"Classification based on key terms: {', '.join(top_features)}"
|
138 |
return explanation
|
139 |
|
140 |
|
141 |
class LLMClassifier(BaseClassifier):
|
142 |
"""Classifier using a Large Language Model for more accurate but slower classification"""
|
143 |
-
|
144 |
def __init__(self, client, model="gpt-3.5-turbo"):
|
145 |
super().__init__()
|
146 |
self.client = client
|
147 |
self.model = model
|
148 |
-
|
149 |
-
def classify(
|
|
|
|
|
150 |
"""Classify texts using an LLM with parallel processing"""
|
151 |
if not categories:
|
152 |
# First, use LLM to generate appropriate categories
|
153 |
categories = self._suggest_categories(texts)
|
154 |
-
|
155 |
# Process texts in parallel
|
156 |
with ThreadPoolExecutor(max_workers=10) as executor:
|
157 |
# Submit all tasks with their original indices
|
158 |
future_to_index = {
|
159 |
-
executor.submit(self._classify_text, text, categories): idx
|
160 |
for idx, text in enumerate(texts)
|
161 |
}
|
162 |
-
|
163 |
# Initialize results list with None values
|
164 |
results = [None] * len(texts)
|
165 |
-
|
166 |
# Collect results as they complete
|
167 |
for future in as_completed(future_to_index):
|
168 |
original_idx = future_to_index[future]
|
@@ -174,11 +182,11 @@ class LLMClassifier(BaseClassifier):
|
|
174 |
results[original_idx] = {
|
175 |
"category": categories[0],
|
176 |
"confidence": 50,
|
177 |
-
"explanation": f"Error during classification: {str(e)}"
|
178 |
}
|
179 |
-
|
180 |
return results
|
181 |
-
|
182 |
def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
|
183 |
"""Use LLM to suggest appropriate categories for the dataset"""
|
184 |
# Take a sample of texts to avoid token limitations
|
@@ -186,54 +194,55 @@ class LLMClassifier(BaseClassifier):
|
|
186 |
sample_texts = random.sample(texts, sample_size)
|
187 |
else:
|
188 |
sample_texts = texts
|
189 |
-
|
190 |
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
|
191 |
-
|
192 |
try:
|
193 |
response = self.client.chat.completions.create(
|
194 |
model=self.model,
|
195 |
messages=[{"role": "user", "content": prompt}],
|
196 |
temperature=0.2,
|
197 |
-
max_tokens=100
|
198 |
)
|
199 |
-
|
200 |
# Parse response to get categories
|
201 |
categories_text = response.choices[0].message.content.strip()
|
202 |
categories = [cat.strip() for cat in categories_text.split(",")]
|
203 |
-
|
204 |
return categories
|
205 |
except Exception as e:
|
206 |
# Fallback to default categories on error
|
207 |
print(f"Error suggesting categories: {str(e)}")
|
208 |
return self._generate_default_categories(texts)
|
209 |
-
|
210 |
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
211 |
"""Use LLM to classify a single text"""
|
212 |
prompt = TEXT_CLASSIFICATION_PROMPT.format(
|
213 |
-
categories=", ".join(categories),
|
214 |
-
text=text
|
215 |
)
|
216 |
-
|
217 |
try:
|
218 |
response = self.client.chat.completions.create(
|
219 |
model=self.model,
|
220 |
messages=[{"role": "user", "content": prompt}],
|
221 |
temperature=0,
|
222 |
-
max_tokens=200
|
223 |
)
|
224 |
-
|
225 |
# Parse JSON response
|
226 |
response_text = response.choices[0].message.content.strip()
|
227 |
-
|
228 |
result = json.loads(response_text)
|
229 |
# Ensure all required fields are present
|
230 |
if not all(k in result for k in ["category", "confidence", "explanation"]):
|
231 |
raise ValueError("Missing required fields in LLM response")
|
232 |
-
|
233 |
# Validate category is in the list
|
234 |
if result["category"] not in categories:
|
235 |
-
result["category"] = categories[
|
236 |
-
|
|
|
|
|
237 |
# Validate confidence is a number between 0 and 100
|
238 |
try:
|
239 |
result["confidence"] = float(result["confidence"])
|
@@ -241,7 +250,7 @@ class LLMClassifier(BaseClassifier):
|
|
241 |
result["confidence"] = 50
|
242 |
except:
|
243 |
result["confidence"] = 50
|
244 |
-
|
245 |
return result
|
246 |
except json.JSONDecodeError:
|
247 |
# Fall back to simple parsing if JSON fails
|
@@ -250,12 +259,9 @@ class LLMClassifier(BaseClassifier):
|
|
250 |
if cat.lower() in response_text.lower():
|
251 |
category = cat
|
252 |
break
|
253 |
-
|
254 |
return {
|
255 |
"category": category,
|
256 |
"confidence": 50,
|
257 |
-
"explanation": f"Classification based on language model analysis. (Note: Structured response parsing failed)"
|
258 |
}
|
259 |
-
|
260 |
-
|
261 |
-
|
|
|
9 |
from typing import List, Dict, Any, Optional
|
10 |
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
11 |
|
12 |
+
|
13 |
class BaseClassifier:
|
14 |
"""Base class for text classifiers"""
|
15 |
+
|
16 |
def __init__(self):
|
17 |
pass
|
18 |
+
|
19 |
def classify(self, texts, categories=None):
|
20 |
"""
|
21 |
Classify a list of texts into categories
|
22 |
+
|
23 |
Args:
|
24 |
texts (list): List of text strings to classify
|
25 |
categories (list, optional): List of category names. If None, categories will be auto-detected
|
26 |
+
|
27 |
Returns:
|
28 |
list: List of classification results with categories, confidence scores, and explanations
|
29 |
"""
|
30 |
raise NotImplementedError("Subclasses must implement this method")
|
31 |
+
|
32 |
def _generate_default_categories(self, texts, num_clusters=5):
|
33 |
"""
|
34 |
Generate default categories based on text clustering
|
35 |
+
|
36 |
Args:
|
37 |
texts (list): List of text strings
|
38 |
num_clusters (int): Number of clusters to generate
|
39 |
+
|
40 |
Returns:
|
41 |
list: List of category names
|
42 |
"""
|
|
|
47 |
|
48 |
class TFIDFClassifier(BaseClassifier):
|
49 |
"""Classifier using TF-IDF and clustering for fast classification"""
|
50 |
+
|
51 |
def __init__(self):
|
52 |
super().__init__()
|
53 |
self.vectorizer = TfidfVectorizer(
|
54 |
+
max_features=1000, stop_words="english", ngram_range=(1, 2)
|
|
|
|
|
55 |
)
|
56 |
self.model = None
|
57 |
self.feature_names = None
|
58 |
self.categories = None
|
59 |
self.centroids = None
|
60 |
+
|
61 |
def classify(self, texts, categories=None):
|
62 |
"""Classify texts using TF-IDF and clustering"""
|
63 |
# Vectorize the texts
|
64 |
X = self.vectorizer.fit_transform(texts)
|
65 |
self.feature_names = self.vectorizer.get_feature_names_out()
|
66 |
+
|
67 |
# Auto-detect categories if not provided
|
68 |
if not categories:
|
69 |
num_clusters = min(5, len(texts)) # Don't create more clusters than texts
|
|
|
71 |
else:
|
72 |
self.categories = categories
|
73 |
num_clusters = len(categories)
|
74 |
+
|
75 |
# Cluster the texts
|
76 |
self.model = KMeans(n_clusters=num_clusters, random_state=42)
|
77 |
clusters = self.model.fit_predict(X)
|
78 |
self.centroids = self.model.cluster_centers_
|
79 |
+
|
80 |
# Calculate distances to centroids for confidence
|
81 |
distances = self._calculate_distances(X)
|
82 |
+
|
83 |
# Prepare results
|
84 |
results = []
|
85 |
for i, text in enumerate(texts):
|
86 |
cluster_idx = clusters[i]
|
87 |
+
|
88 |
# Calculate confidence (inverse of distance, normalized)
|
89 |
confidence = self._calculate_confidence(distances[i])
|
90 |
+
|
91 |
# Create explanation
|
92 |
explanation = self._generate_explanation(X[i], cluster_idx)
|
93 |
+
|
94 |
+
results.append(
|
95 |
+
{
|
96 |
+
"category": self.categories[cluster_idx],
|
97 |
+
"confidence": confidence,
|
98 |
+
"explanation": explanation,
|
99 |
+
}
|
100 |
+
)
|
101 |
+
|
102 |
return results
|
103 |
+
|
104 |
def _calculate_distances(self, X):
|
105 |
"""Calculate distances from each point to each centroid"""
|
106 |
+
return np.sqrt(
|
107 |
+
(
|
108 |
+
(X.toarray()[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]) ** 2
|
109 |
+
).sum(axis=2)
|
110 |
+
)
|
111 |
+
|
112 |
def _calculate_confidence(self, distances):
|
113 |
"""Convert distances to confidence scores (0-100)"""
|
114 |
min_dist = np.min(distances)
|
115 |
max_dist = np.max(distances)
|
116 |
+
|
117 |
# Normalize and invert (smaller distance = higher confidence)
|
118 |
if max_dist == min_dist:
|
119 |
return 70 # Default mid-range confidence when all distances are equal
|
120 |
+
|
121 |
normalized_dist = (distances - min_dist) / (max_dist - min_dist)
|
122 |
min_normalized = np.min(normalized_dist)
|
123 |
+
|
124 |
# Invert and scale to 50-100 range (TF-IDF is never 100% confident)
|
125 |
confidence = 100 - (min_normalized * 50)
|
126 |
return round(confidence, 1)
|
127 |
+
|
128 |
def _generate_explanation(self, text_vector, cluster_idx):
|
129 |
"""Generate an explanation for the classification"""
|
130 |
# Get the most important features for this cluster
|
131 |
centroid = self.centroids[cluster_idx]
|
132 |
+
|
133 |
# Get indices of top features for this text
|
134 |
text_array = text_vector.toarray()[0]
|
135 |
top_indices = text_array.argsort()[-5:][::-1]
|
136 |
+
|
137 |
# Get the feature names for these indices
|
138 |
top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
|
139 |
+
|
140 |
if not top_features:
|
141 |
return "No significant features identified for this classification."
|
142 |
+
|
143 |
explanation = f"Classification based on key terms: {', '.join(top_features)}"
|
144 |
return explanation
|
145 |
|
146 |
|
147 |
class LLMClassifier(BaseClassifier):
    """Classifier using a Large Language Model for more accurate but slower classification"""

    def __init__(self, client, model="gpt-3.5-turbo"):
        super().__init__()
        self.client = client
        self.model = model

    def classify(
        self, texts: List[str], categories: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Classify texts using an LLM with parallel processing"""
        if not categories:
            # First, use LLM to generate appropriate categories
            categories = self._suggest_categories(texts)

        # Process texts in parallel
        with ThreadPoolExecutor(max_workers=10) as executor:
            # Submit all tasks with their original indices
            future_to_index = {
                executor.submit(self._classify_text, text, categories): idx
                for idx, text in enumerate(texts)
            }

            # Initialize results list with None values
            results = [None] * len(texts)

            # Collect results as they complete
            for future in as_completed(future_to_index):
                original_idx = future_to_index[future]
                try:
                    results[original_idx] = future.result()
                except Exception as e:
                    results[original_idx] = {
                        "category": categories[0],
                        "confidence": 50,
                        "explanation": f"Error during classification: {str(e)}",
                    }

        return results

    def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
        """Use LLM to suggest appropriate categories for the dataset"""
        # Take a sample of texts to avoid token limitations
        if len(texts) > sample_size:
            sample_texts = random.sample(texts, sample_size)
        else:
            sample_texts = texts

        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=100,
            )

            # Parse response to get categories
            categories_text = response.choices[0].message.content.strip()
            categories = [cat.strip() for cat in categories_text.split(",")]

            return categories
        except Exception as e:
            # Fallback to default categories on error
            print(f"Error suggesting categories: {str(e)}")
            return self._generate_default_categories(texts)

    def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
        """Use LLM to classify a single text"""
        prompt = TEXT_CLASSIFICATION_PROMPT.format(
            categories=", ".join(categories), text=text
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=200,
            )

            # Parse JSON response
            response_text = response.choices[0].message.content.strip()

            result = json.loads(response_text)
            # Ensure all required fields are present
            if not all(k in result for k in ["category", "confidence", "explanation"]):
                raise ValueError("Missing required fields in LLM response")

            # Validate category is in the list
            if result["category"] not in categories:
                result["category"] = categories[
                    0
                ]  # Default to first category if invalid

            # Validate confidence is a number between 0 and 100
            try:
                result["confidence"] = float(result["confidence"])
                if result["confidence"] < 0 or result["confidence"] > 100:
                    result["confidence"] = 50
            except:
                result["confidence"] = 50

            return result
        except json.JSONDecodeError:
            # Fall back to simple parsing if JSON fails
            category = categories[0]  # Default to first category
            for cat in categories:
                if cat.lower() in response_text.lower():
                    category = cat
                    break

            return {
                "category": category,
                "confidence": 50,
                "explanation": f"Classification based on language model analysis. (Note: Structured response parsing failed)",
            }
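Aside, not part of this commit: a minimal stand-alone sketch of the index-preserving fan-out pattern that classify uses above. classify_one here is a hypothetical stand-in for self._classify_text.

from concurrent.futures import ThreadPoolExecutor, as_completed

def classify_all(texts, classify_one, max_workers=10):
    # Keyed by future so each result can be written back to its input position
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(classify_one, text): idx for idx, text in enumerate(texts)
        }
        results = [None] * len(texts)
        for future in as_completed(future_to_index):
            idx = future_to_index[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                # Same fallback shape as LLMClassifier.classify
                results[idx] = {"category": None, "confidence": 50, "explanation": str(e)}
    return results

# Toy classify_one that never touches an API
print(classify_all(["spam", "ham"], lambda t: {"category": t.upper(), "confidence": 90, "explanation": ""}))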
prompts.py
CHANGED
@@ -60,4 +60,4 @@ Example texts:
{}

Return your answer as a comma-separated list of new category names only.
"""
utils.py
CHANGED
@@ -6,61 +6,66 @@ from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import tempfile


def load_data(file_path):
    """
    Load data from an Excel or CSV file

    Args:
        file_path (str): Path to the file

    Returns:
        pd.DataFrame: Loaded data
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == ".xlsx" or file_ext == ".xls":
        return pd.read_excel(file_path)
    elif file_ext == ".csv":
        return pd.read_csv(file_path)
    else:
        raise ValueError(
            f"Unsupported file format: {file_ext}. Please upload an Excel or CSV file."
        )


def export_data(df, file_name, format_type="excel"):
    """
    Export dataframe to file

    Args:
        df (pd.DataFrame): Dataframe to export
        file_name (str): Name of the output file
        format_type (str): "excel" or "csv"

    Returns:
        str: Path to the exported file
    """
    # Create export directory if it doesn't exist
    export_dir = "exports"
    os.makedirs(export_dir, exist_ok=True)

    # Full path for the export file
    export_path = os.path.join(export_dir, file_name)

    # Export based on format type
    if format_type == "excel":
        df.to_excel(export_path, index=False)
    else:
        df.to_csv(export_path, index=False)

    return export_path


def visualize_results(df, text_column, category_column="Category"):
    """
    Create visualization of classification results

    Args:
        df (pd.DataFrame): Dataframe with classification results
        text_column (str): Name of the column containing text data
        category_column (str): Name of the column containing categories

    Returns:
        matplotlib.figure.Figure: Visualization figure
    """
@@ -68,52 +73,58 @@ def visualize_results(df, text_column, category_column="Category"):
    if category_column not in df.columns:
        # Create a simple figure with a message
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(
            0.5, 0.5, "No categories to display", ha="center", va="center", fontsize=12
        )
        ax.set_title("No Classification Results Available")
        plt.tight_layout()
        return fig

    # Get categories and their counts
    category_counts = df[category_column].value_counts()

    # Create a new figure
    fig, ax = plt.subplots(figsize=(10, 6))

    # Create the histogram
    bars = ax.bar(category_counts.index, category_counts.values)

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{int(height)}",
            ha="center",
            va="bottom",
        )

    # Customize the plot
    ax.set_xlabel("Categories")
    ax.set_ylabel("Number of Texts")
    ax.set_title("Distribution of Classified Texts")

    # Rotate x-axis labels if they're too long
    plt.xticks(rotation=45, ha="right")

    # Add grid
    ax.grid(True, linestyle="--", alpha=0.7)

    plt.tight_layout()

    return fig


def validate_results(df, text_columns, client):
    """
    Use LLM to validate the classification results

    Args:
        df (pd.DataFrame): Dataframe with classification results
        text_columns (list): List of column names containing text data
        client: LiteLLM client

    Returns:
        str: Validation report
    """
@@ -121,7 +132,7 @@ def validate_results(df, text_columns, client):
        # Sample a few rows for validation
        sample_size = min(5, len(df))
        sample_df = df.sample(n=sample_size, random_state=42)

        # Build validation prompt
        validation_prompts = []
        for _, row in sample_df.iterrows():
@@ -129,11 +140,11 @@ def validate_results(df, text_columns, client):
            text = " ".join(str(row[col]) for col in text_columns)
            assigned_category = row["Category"]
            confidence = row["Confidence"]

            validation_prompts.append(
                f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
            )

        prompt = """
As a validation expert, review the following text classifications and provide feedback.
For each text, assess whether the assigned category seems appropriate:
@@ -146,19 +157,21 @@ def validate_results(df, text_columns, client):
3. Suggestions for improvement

Keep your response under 300 words.
""".format(
            "\n---\n".join(validation_prompts)
        )

        # Call LLM API
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=400,
        )

        validation_report = response.choices[0].message.content.strip()
        return validation_report

    except Exception as e:
        return f"Validation failed: {str(e)}"
@@ -166,7 +179,7 @@ def validate_results(df, text_columns, client):
def create_example_file():
    """
    Create an example CSV file for testing

    Returns:
        str: Path to the created file
    """
@@ -182,17 +195,17 @@ def create_example_file():
            "It's okay, nothing special but gets the job done.",
            "I'm extremely disappointed with the quality of this product.",
            "This is the best purchase I've made all year!",
            "It's reasonably priced and works as expected.",
        ]
    }

    # Create dataframe
    df = pd.DataFrame(data)

    # Save to a CSV file
    example_dir = "examples"
    os.makedirs(example_dir, exist_ok=True)
    file_path = os.path.join(example_dir, "sample_reviews.csv")
    df.to_csv(file_path, index=False)

    return file_path
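
Aside, not part of this commit: a hypothetical end-to-end use of these helpers. The "text" column name is an assumption, since create_example_file's dict keys are elided from this diff.

from utils import create_example_file, load_data, export_data, visualize_results

path = create_example_file()          # writes examples/sample_reviews.csv
df = load_data(path)                  # dispatches to pd.read_csv for .csv files
df["Category"] = "Uncategorized"      # placeholder so the plot has counts to show
out_path = export_data(df, "sample_reviews_out.xlsx", format_type="excel")  # also creates exports/
fig = visualize_results(df, text_column="text", category_column="Category")  # assumed column name
fig.savefig("exports/category_distribution.png")
print(out_path)                       # exports/sample_reviews_out.xlsx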
|