gourisankar85 committed · Commit efb5c9e · verified · 1 Parent(s): 870f650

Upload 3 files

Files changed (3):
  1. app.py +195 -169
  2. config.json +2 -2
  3. main.py +25 -56
app.py CHANGED
@@ -1,181 +1,207 @@
  import gradio as gr
- import logging
- import threading
- import time
- from generator.compute_metrics import get_attributes_text
- from generator.generate_metrics import generate_metrics, retrieve_and_generate_response
- from config import AppConfig, ConfigConstants
- from generator.initialize_llm import initialize_generation_llm, initialize_validation_llm
-
- def launch_gradio(config : AppConfig):
-     """
-     Launch the Gradio app with pre-initialized objects.
-     """
-     logger = logging.getLogger()
-     logger.setLevel(logging.INFO)
-
-     # Create a list to store logs
-     logs = []
-
-     # Custom log handler to capture logs and add them to the logs list
-     class LogHandler(logging.Handler):
-         def emit(self, record):
-             log_entry = self.format(record)
-             logs.append(log_entry)
-
-     # Add custom log handler to the logger
-     log_handler = LogHandler()
-     log_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
-     logger.addHandler(log_handler)
-
-     def log_updater():
-         """Background function to add logs."""
-         while True:
-             time.sleep(2)  # Update logs every 2 seconds
-             pass  # Log capture is now handled by the logging system
-
-     def get_logs():
-         """Retrieve logs for display."""
-         return "\n".join(logs[-50:])  # Only show the last 50 logs for example
-
-     # Start the logging thread
-     threading.Thread(target=log_updater, daemon=True).start()
-
-     def answer_question(query, state):
-         try:
-             # Generate response using the passed objects
-             response, source_docs = retrieve_and_generate_response(config.gen_llm, config.vector_store, query)
-
-             # Update state with the response and source documents
-             state["query"] = query
-             state["response"] = response
-             state["source_docs"] = source_docs
-
-             response_text = f"Response: {response}\n\n"
-             return response_text, state
-         except Exception as e:
-             logging.error(f"Error processing query: {e}")
-             return f"An error occurred: {e}", state
-
-     def compute_metrics(state):
-         try:
-             logging.info(f"Computing metrics")
-
-             # Retrieve response and source documents from state
-             response = state.get("response", "")
-             source_docs = state.get("source_docs", {})
-             query = state.get("query", "")
-
-             # Generate metrics using the passed objects
-             attributes, metrics = generate_metrics(config.val_llm, response, source_docs, query, 1)
-
-             attributes_text = get_attributes_text(attributes)
-
-             metrics_text = "Metrics:\n"
-             for key, value in metrics.items():
-                 if key != 'response':
-                     metrics_text += f"{key}: {value}\n"
-
-             return attributes_text, metrics_text
-         except Exception as e:
-             logging.error(f"Error computing metrics: {e}")
-             return f"An error occurred: {e}", ""
-
-     def reinitialize_gen_llm(gen_llm_name):
-         """Reinitialize the generation LLM and return updated model info."""
-         if gen_llm_name.strip():  # Only update if input is not empty
-             config.gen_llm = initialize_generation_llm(gen_llm_name)
-
-         # Return updated model information
-         updated_model_info = (
-             f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
-             f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
-             f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
-         )
-         return updated_model_info
-
-     def reinitialize_val_llm(val_llm_name):
-         """Reinitialize the generation LLM and return updated model info."""
-         if val_llm_name.strip():  # Only update if input is not empty
-             config.val_llm = initialize_validation_llm(val_llm_name)
-
-         # Return updated model information
-         updated_model_info = (
-             f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
-             f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
-             f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
-         )
-         return updated_model_info

-     # Define Gradio Blocks layout
-     with gr.Blocks() as interface:
-         interface.title = "Real Time RAG Pipeline Q&A"
-         gr.Markdown("### Real Time RAG Pipeline Q&A")  # Heading
-
-         # Textbox for new generation LLM name
          with gr.Row():
-             new_gen_llm_input = gr.Textbox(label="New Generation LLM Name", placeholder="Enter LLM name to update")
-             update_gen_llm_button = gr.Button("Update Generation LLM")
-             new_val_llm_input = gr.Textbox(label="New Validation LLM Name", placeholder="Enter LLM name to update")
-             update_val_llm_button = gr.Button("Update Validation LLM")
-
-         # Section to display LLM names
          with gr.Row():
-             model_info = f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
-             model_info += f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
-             model_info += f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
-             model_info_display = gr.Textbox(value=model_info, label="Model Information", interactive=False)  # Read-only textbox
-
-         # State to store response and source documents
-         state = gr.State(value={"query": "","response": "", "source_docs": {}})
-         gr.Markdown("Ask a question and get a response with metrics calculated from the RAG pipeline.")  # Description
          with gr.Row():
-             query_input = gr.Textbox(label="Ask a question", placeholder="Type your query here")
          with gr.Row():
-             submit_button = gr.Button("Submit", variant="primary")  # Submit button
-             clear_query_button = gr.Button("Clear")  # Clear button
          with gr.Row():
-             answer_output = gr.Textbox(label="Response", placeholder="Response will appear here")

-         with gr.Row():
-             compute_metrics_button = gr.Button("Compute metrics", variant="primary")
-             attr_output = gr.Textbox(label="Attributes", placeholder="Attributes will appear here")
-             metrics_output = gr.Textbox(label="Metrics", placeholder="Metrics will appear here")
-
-         #with gr.Row():
-
-         # Define button actions
-         submit_button.click(
-             fn=answer_question,
-             inputs=[query_input, state],
-             outputs=[answer_output, state]
-         )
-         clear_query_button.click(fn=lambda: "", outputs=[query_input])  # Clear query input
-         compute_metrics_button.click(
-             fn=compute_metrics,
-             inputs=[state],
-             outputs=[attr_output, metrics_output]
-         )

-         update_gen_llm_button.click(
-             fn=reinitialize_gen_llm,
-             inputs=[new_gen_llm_input],
-             outputs=[model_info_display]  # Update the displayed model info
-         )
-
-         update_val_llm_button.click(
-             fn=reinitialize_val_llm,
-             inputs=[new_val_llm_input],
-             outputs=[model_info_display]  # Update the displayed model info
-         )

-         # Section to display logs
-         with gr.Row():
-             start_log_button = gr.Button("Start Log Update", elem_id="start_btn")  # Button to start log updates
-         with gr.Row():
-             log_section = gr.Textbox(label="Logs", interactive=False, visible=True, lines=10)  # Log section

-         # Set button click to trigger log updates
-         start_log_button.click(fn=get_logs, outputs=log_section)

-     interface.launch()
  import gradio as gr
+ import os
+ import json
+ import pandas as pd
+ from scripts.evaluate_information_integration import evaluate_information_integration
+ from scripts.evaluate_negative_rejection import evaluate_negative_rejection
+ from scripts.helper import update_config
+ from scripts.evaluate_noise_robustness import evaluate_noise_robustness
+ from scripts.evaluate_factual_robustness import evaluate_factual_robustness
+
+ # Paths to score files
+ Noise_Robustness_DIR = "results/Noise Robustness/"
+ Negative_Rejection_DIR = "results/Negative Rejection/"
+ Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
+ Information_Integration_DIR = "results/Information Integration/"
+
+ # Function to read and aggregate score data
+ def load_scores(file_dir):
+     models = set()
+     noise_rates = set()
+
+     if not os.path.exists(file_dir):
+         return pd.DataFrame(columns=["Noise Ratio"])
+
+     score_data = {}
+
+     # Read all JSON score files
+     for filename in os.listdir(file_dir):
+         if filename.startswith("scores_") and filename.endswith(".json"):
+             filepath = os.path.join(file_dir, filename)
+             with open(filepath, "r") as f:
+                 score = json.load(f)
+                 model = score["model"]
+                 noise_rate = str(score['noise_rate'])
+
+                 models.add(model)
+                 noise_rates.add(noise_rate)
+
+                 score_data[(model, noise_rate)] = score["accuracy"]
+
+     # Convert to DataFrame: one row per model, one column per noise rate
+     df = pd.DataFrame([
+         {
+             "Noise Ratio": model,
+             **{
+                 rate: f"{score_data.get((model, rate), 'N/A') * 100:.2f}"
+                 if score_data.get((model, rate), "N/A") != "N/A"
+                 else "N/A"
+                 for rate in sorted(noise_rates, key=float)
+             }
+         }
+         for model in sorted(models)
+     ])
+
+     return df
+
+ # Function to load Negative Rejection scores (Only for Noise Rate = 1.0)
+ def load_negative_rejection_scores():
+     if not os.path.exists(Negative_Rejection_DIR):
+         return pd.DataFrame()
+
+     score_data = {}
+     models = set()
+
+     for filename in os.listdir(Negative_Rejection_DIR):
+         if filename.startswith("scores_") and filename.endswith(".json") and "_noise_1.0_" in filename:
+             filepath = os.path.join(Negative_Rejection_DIR, filename)
+             with open(filepath, "r") as f:
+                 score = json.load(f)
+                 model = filename.split("_")[1]  # Extract model name
+                 models.add(model)
+                 score_data[model] = score.get("reject_rate", "N/A")
+
+     df = pd.DataFrame([
+         {"Model": model,
+          "Rejection Rate": f"{score_data.get(model, 'N/A') * 100:.2f}%"
+          if score_data.get(model, "N/A") != "N/A"
+          else "N/A"}
+         for model in sorted(models)
+     ])
+
+     return df if not df.empty else pd.DataFrame(columns=["Model", "Rejection Rate"])
+
+ def load_counterfactual_robustness_scores():
+     models = set()
+
+     if not os.path.exists(Counterfactual_Robustness_DIR):
+         return pd.DataFrame(columns=["Noise Ratio"])
+
+     score_data = {}
+
+     # Read all JSON score files
+     for filename in os.listdir(Counterfactual_Robustness_DIR):
+         if filename.startswith("scores_") and filename.endswith(".json"):
+             filepath = os.path.join(Counterfactual_Robustness_DIR, filename)
+             with open(filepath, "r") as f:
+                 score = json.load(f)
+                 model = filename.split("_")[1]
+                 models.add(model)
+                 score_data[model] = {
+                     "Accuracy (%)": int(score["all_rate"] * 100),  # No decimal
+                     "Error Detection Rate": int(score["reject_rate"] * 100),
+                     "Correction Rate (%)": round(score["correct_rate"] * 100, 2)  # 2 decimal places
+                 }
+
+     # Convert to DataFrame
+     df = pd.DataFrame([
+         {
+             "Model": model,
+             "Accuracy (%)": score_data.get(model, {}).get("Accuracy (%)", "N/A"),
+             "Error Detection Rate": score_data.get(model, {}).get("Error Detection Rate", "N/A"),
+             "Correction Rate (%)": f"{score_data.get(model, {}).get('Correction Rate (%)', 'N/A'):.2f}"
+         }
+         for model in sorted(models)
+     ])
+
+     return df
+
+ # Gradio UI
+ def launch_gradio_app(config):
+     with gr.Blocks() as app:
+         app.title = "RAG System Evaluation"
+         gr.Markdown("# RAG System Evaluation on RGB Dataset")
+
+         # Top Section - Inputs and Controls
          with gr.Row():
+             model_name_input = gr.Dropdown(
+                 label="Model Name",
+                 choices=config["models"],
+                 value="llama3-8b-8192",
+                 interactive=True
+             )
+             noise_rate_input = gr.Slider(label="Noise Rate", minimum=0, maximum=1.0, step=0.2, value=0.2, interactive=True)
+             num_queries_input = gr.Number(label="Number of Queries", value=50, interactive=True)
+
+         # Bottom Section - Action Buttons
          with gr.Row():
+             recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
+             recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
+             recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
+             recalculate_integration_btn = gr.Button("Evaluate Information Integration")
+
          with gr.Row():
+             refresh_btn = gr.Button("Refresh", variant="primary", scale=0)
+
+         # Middle Section - Data Tables
          with gr.Row():
+             with gr.Column():
+                 gr.Markdown("### 📊 Noise Robustness\n**Description:** The experimental results of noise robustness, measured by accuracy (%) under different noise ratios. Results show that an increasing noise rate poses a challenge for RAG in LLMs.")
+                 noise_table = gr.Dataframe(value=load_scores(Noise_Robustness_DIR), interactive=False)
+             with gr.Column():
+                 gr.Markdown("### 🚫 Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries instead of generating incorrect responses. A higher rejection rate means the model is better at filtering unreliable inputs.")
+                 rejection_table = gr.Dataframe(value=load_negative_rejection_scores(), interactive=False)
+
          with gr.Row():
+             with gr.Column():
+                 gr.Markdown("""
+                 ### 🔄 Counterfactual Robustness
+                 **Description:**
+                 Counterfactual Robustness evaluates a model's ability to handle **errors in external knowledge** while ensuring reliable responses.
+
+                 **Key Metrics in this Report:**
+                 - **Accuracy (%)** → Measures the accuracy (%) of LLMs with counterfactual documents.
+                 - **Error Detection Rate (%)** → Measures how often the model **rejects** incorrect or misleading queries instead of responding.
+                 - **Correction Rate (%)** → Measures how often the model provides accurate responses despite **potential misinformation**.
+                 """)
+                 counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(), interactive=False)
+             with gr.Column():
+                 gr.Markdown("### 🧠 Information Integration\n**Description:** The experimental results of information integration, measured by accuracy (%) under different noise ratios. Results show that information integration poses a challenge for RAG in LLMs.")
+                 integration_table = gr.Dataframe(value=load_scores(Information_Integration_DIR), interactive=False)

+         # Refresh Scores Function
+         def refresh_scores():
+             return load_scores(Noise_Robustness_DIR), load_negative_rejection_scores(), load_counterfactual_robustness_scores(), load_scores(Information_Integration_DIR)
+
+         refresh_btn.click(refresh_scores, outputs=[noise_table, rejection_table, counter_factual_table, integration_table])
+
+         # Button Functions
+         def recalculate_noise_robustness(model_name, noise_rate, num_queries):
+             update_config(config, model_name, noise_rate, num_queries)
+             evaluate_noise_robustness(config)
+             return load_scores(Noise_Robustness_DIR)

+         recalculate_noise_btn.click(recalculate_noise_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[noise_table])
+
+         def recalculate_counterfactual_robustness(model_name, noise_rate, num_queries):
+             update_config(config, model_name, noise_rate, num_queries)
+             evaluate_factual_robustness(config)
+             return load_counterfactual_robustness_scores()

+         recalculate_counterfactual_btn.click(recalculate_counterfactual_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[counter_factual_table])
+
+         def recalculate_negative_rejection(model_name, noise_rate, num_queries):
+             update_config(config, model_name, noise_rate, num_queries)
+             evaluate_negative_rejection(config)
+             return load_negative_rejection_scores()
+
+         recalculate_negative_btn.click(recalculate_negative_rejection, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[rejection_table])

+         def recalculate_integration_info(model_name, noise_rate, num_queries):
+             update_config(config, model_name, noise_rate, num_queries)
+             evaluate_information_integration(config)
+             return load_scores(Information_Integration_DIR)
+
+         recalculate_integration_btn.click(recalculate_integration_info, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[integration_table])

+     app.launch()
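For context, `load_scores` aggregates one JSON file per (model, noise rate) run, named `scores_*.json` and carrying at least the `model`, `noise_rate`, and `accuracy` keys used above. A minimal sketch of that contract (the directory name and keys come from the code; the concrete file name and values below are illustrative, not from the repository):

```python
# Illustrative only: fabricate one score file in the layout load_scores reads.
import json
import os

os.makedirs("results/Noise Robustness", exist_ok=True)
sample = {"model": "llama3-8b-8192", "noise_rate": 0.2, "accuracy": 0.86}
# Hypothetical file name, consistent with the "scores_<model>_noise_<rate>_" checks above
with open("results/Noise Robustness/scores_llama3-8b-8192_noise_0.2.json", "w") as f:
    json.dump(sample, f)

# load_scores("results/Noise Robustness/") would then render one row for the
# model, with "86.00" in the 0.2 column.
```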
config.json CHANGED
@@ -3,11 +3,11 @@
   "factual_file_name":"en_fact.json",
   "integration_file_name":"en_int.json",
   "result_path": "results/",
-  "models": ["llama3-8b-8192","qwen-2.5-32b", "mixtral-8x7b-32768", "gemma2-9b-it", "deepseek-r1-distill-llama-70b" ],
+  "models": ["llama3-8b-8192", "qwen-2.5-32b", "mixtral-8x7b-32768", "gemma2-9b-it", "deepseek-r1-distill-llama-70b" ],
   "model_name":"gemma2-9b-it",
   "noise_rate": 0.4,
   "passage_num": 5,
-  "num_queries": 10,
+  "num_queries": 50,
   "retry_attempts": 3,
   "timeout_limit": 60
 }
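The dashboard's recalculate callbacks push the UI selections back into this config via `scripts.helper.update_config`, whose body is not part of this commit. A plausible minimal sketch, assuming it only overwrites the three keys shown here (the real helper may also validate values or persist the file):

```python
# Hypothetical sketch of scripts/helper.update_config; not from the commit.
# Assumes it mutates the in-memory config dict using the config.json keys.
def update_config(config, model_name, noise_rate, num_queries):
    config["model_name"] = model_name          # e.g. "gemma2-9b-it"
    config["noise_rate"] = noise_rate          # 0.0 .. 1.0, slider step 0.2
    config["num_queries"] = int(num_queries)   # gr.Number may deliver a float
    return config
```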
main.py CHANGED
@@ -1,64 +1,33 @@
  import logging
- from config import AppConfig, ConfigConstants
- from data.load_dataset import load_data
- from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
- from retriever.chunk_documents import chunk_documents
- from retriever.embed_documents import embed_documents
- from generator.initialize_llm import initialize_generation_llm
- from generator.initialize_llm import initialize_validation_llm
- from app import launch_gradio
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  def main():
-     logging.info("Starting the RAG pipeline")
-
-     # Dictionary to store chunked documents
-     all_chunked_documents = []
-     datasets = {}

-     # Load multiple datasets
-     for data_set_name in ConfigConstants.DATA_SET_NAMES:
-         logging.info(f"Loading dataset: {data_set_name}")
-         datasets[data_set_name] = load_data(data_set_name)

-         # Set chunk size based on dataset name
-         chunk_size = ConfigConstants.DEFAULT_CHUNK_SIZE
-         if data_set_name == 'cuad':
-             chunk_size = 4000  # Custom chunk size for 'cuad'
-
-         # Chunk documents
-         chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=ConfigConstants.CHUNK_OVERLAP)
-         all_chunked_documents.extend(chunked_documents)  # Combine all chunks

-     # Access individual datasets
-     #for name, dataset in datasets.items():
-         #logging.info(f"Loaded {name} with {dataset.num_rows} rows")
-
-     # Logging final count
-     logging.info(f"Total chunked documents: {len(all_chunked_documents)}")

-     # Embed the documents
-     vector_store = embed_documents(all_chunked_documents)
-     logging.info("Documents embedded")
-
-     # Initialize the Generation LLM
-     gen_llm = initialize_generation_llm(ConfigConstants.GENERATION_MODEL_NAME)
-
-     # Initialize the Validation LLM
-     val_llm = initialize_validation_llm(ConfigConstants.VALIDATION_MODEL_NAME)
-
-     #Compute RMSE and AUC-ROC for entire dataset
-     #Enable below code for calculation
-     #data_set_name = 'covidqa'
-     #compute_rmse_auc_roc_metrics(gen_llm, val_llm, datasets[data_set_name], vector_store, 10)
-
-     # Launch the Gradio app
-     config = AppConfig(vector_store= vector_store, gen_llm = gen_llm, val_llm = val_llm)
-     launch_gradio(config)
-
-     logging.info("Finished!!!")
-
  if __name__ == "__main__":
-     main()

+ import json
  import logging
+ from app import launch_gradio_app
+ from scripts.download_files import download_file, get_file_list
+
+ def load_config(config_file="config.json"):
+     """Load configuration from the config file."""
+     try:
+         with open(config_file, "r", encoding="utf-8") as f:
+             config = json.load(f)
+             return config
+     except Exception as e:
+         logging.info(f"Error loading config: {e}")
+         return {}

  def main():
+     # Load configuration
+     config = load_config()

+     logging.info(f"Model: {config['model_name']}")
+     logging.info(f"Noise Rate: {config['noise_rate']}")
+     logging.info(f"Passage Number: {config['passage_num']}")
+     logging.info(f"Number of Queries: {config['num_queries']}")

+     # Download files from the GitHub repository
+     files = get_file_list()
+     for file in files:
+         download_file(file)

+     launch_gradio_app(config)

  if __name__ == "__main__":
+     main()
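`scripts.download_files` is imported here but not included in this commit. A hedged sketch of what it might look like, assuming the data files named in `config.json` (such as `en_fact.json` and `en_int.json`) are fetched from a GitHub raw URL; the base URL and file list below are placeholders:

```python
# Hypothetical sketch of scripts/download_files.py; not part of this commit.
import os
import urllib.request

# Placeholder: the real repository URL is not shown in the diff.
BASE_URL = "https://raw.githubusercontent.com/<owner>/<repo>/main/data/"

def get_file_list():
    # File names taken from config.json; the real list may differ.
    return ["en_fact.json", "en_int.json"]

def download_file(filename, dest_dir="data"):
    os.makedirs(dest_dir, exist_ok=True)
    dest = os.path.join(dest_dir, filename)
    if not os.path.exists(dest):  # skip files that were already fetched
        urllib.request.urlretrieve(BASE_URL + filename, dest)
```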