gourisankar85 committed (verified)
Commit 870f650 · Parent(s): a1710c6

Upload 3 files
Files changed (3):
  1. app.py +169 -197
  2. config.py +16 -0
  3. main.py +54 -33
app.py CHANGED
@@ -1,209 +1,181 @@
 import gradio as gr
-import os
-import json
-import pandas as pd
-from scripts.evaluate_information_integration import evaluate_information_integration
-from scripts.evaluate_negative_rejection import evaluate_negative_rejection
-from scripts.helper import update_config
-from scripts.evaluate_noise_robustness import evaluate_noise_robustness
-from scripts.evaluate_factual_robustness import evaluate_factual_robustness
-
-# Path to score files
-Noise_Robustness_DIR = "results/Noise Robustness/"
-Negative_Rejection_DIR = "results/Negative Rejection/"
-Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
-Infomration_Integration_DIR = "results/Information Integration/"
-
-# Function to read and aggregate score data
-def load_scores(file_dir):
-    models = set()
-    noise_rates = set()
-
-    if not os.path.exists(file_dir):
-        return pd.DataFrame(columns=["Noise Ratio"])
-
-    score_data = {}
-
-    # Read all JSON score files
-    for filename in os.listdir(file_dir):
-        if filename.startswith("scores_") and filename.endswith(".json"):
-            filepath = os.path.join(file_dir, filename)
-            with open(filepath, "r") as f:
-                score = json.load(f)
-                model = score["model"]
-                noise_rate = str(score["noise_rate"])
-
-                models.add(model)
-                noise_rates.add(noise_rate)
-
-                score_data[(model, noise_rate)] = score["accuracy"]
-
-    # Convert to DataFrame
-    df = pd.DataFrame([
-        {
-            "Noise Ratio": model,
-            **{
-                rate: f"{score_data.get((model, rate), 'N/A') * 100:.2f}"
-                if score_data.get((model, rate), "N/A") != "N/A"
-                else "N/A"
-                for rate in sorted(noise_rates, key=float)
-            }
-        }
-        for model in sorted(models)
-    ])
-
-    return df
-
-# Function to load Negative Rejection scores (Only for Noise Rate = 1.0)
-def load_negative_rejection_scores():
-    if not os.path.exists(Negative_Rejection_DIR):
-        return pd.DataFrame()
-
-    score_data = {}
-    models = set()
-
-    for filename in os.listdir(Negative_Rejection_DIR):
-        if filename.startswith("scores_") and filename.endswith(".json") and "_noise_1.0_" in filename:
-            filepath = os.path.join(Negative_Rejection_DIR, filename)
-            with open(filepath, "r") as f:
-                score = json.load(f)
-                model = filename.split("_")[1]  # Extract model name
-                models.add(model)
-                score_data[model] = score.get("reject_rate", "N/A")
-
-    df = pd.DataFrame([
-        {"Model": model, "Rejection Rate": f"{score_data.get(model, 'N/A') * 100:.2f}%"
-            if score_data.get(model, "N/A") != "N/A"
-            else "N/A"}
-        for model in sorted(models)
-    ])
-
-    return df if not df.empty else pd.DataFrame(columns=["Model", "Rejection Rate"])
-
-def load_counterfactual_robustness_scores():
-    models = set()
-
-    if not os.path.exists(Counterfactual_Robustness_DIR):
-        return pd.DataFrame(columns=["Noise Ratio"])
-
-    score_data = {}
-
-    # Read all JSON score files
-    for filename in os.listdir(Counterfactual_Robustness_DIR):
-        if filename.startswith("scores_") and filename.endswith(".json"):
-            filepath = os.path.join(Counterfactual_Robustness_DIR, filename)
-            with open(filepath, "r") as f:
-                score = json.load(f)
-                model = filename.split("_")[1]
-                #noise_rate = str(score["noise_rate"])
-
-                models.add(model)
-                score_data[model] = {
-                    "Accuracy (%)": int(score["all_rate"] * 100),  # No decimal
-                    "Error Detection Rate": int(score["reject_rate"] * 10),
-                    "Correction Rate (%)": round(score["correct_rate"] * 100, 2)  # 2 decimal places
-                }
-
-    # Convert to DataFrame
-    df = pd.DataFrame([
-        {
-            "Model": model,
-            "Accuracy (%)": score_data.get(model, {}).get("Accuracy (%)", "N/A"),
-            "Error Detection Rate": score_data.get(model, {}).get("Error Detection Rate", "N/A"),
-            "Correction Rate (%)": f"{score_data.get(model, {}).get('Correction Rate (%)', 'N/A'):.2f}"
-        }
-        for model in sorted(models)
-    ])
-
-    return df
-
-# Gradio UI
-def launch_gradio_app(config):
-    with gr.Blocks() as app:
-        app.title = "RAG System Evaluation"
-        gr.Markdown("# RAG System Evaluation on RGB Dataset")
-
-        # Top Section - Inputs and Controls
-        with gr.Row():
-            model_name_input = gr.Dropdown(
-                label="Model Name",
-                choices=config["models"],
-                value="llama3-8b-8192",
-                interactive=True
-            )
-            noise_rate_input = gr.Slider(label="Noise Rate", minimum=0, maximum=1.0, step=0.2, value=0.2, interactive=True)
-            num_queries_input = gr.Number(label="Number of Queries", value=50, interactive=True)
-
-        # Bottom Section - Action Buttons
         with gr.Row():
-            recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
-            recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
-            recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
-            recalculate_integration_btn = gr.Button("Evaluate Integration Information")

         with gr.Row():
-            refresh_btn = gr.Button("Refresh", variant="primary", scale=0)
-
-        # Middle Section - Data Tables
         with gr.Row():
-            with gr.Column():
-                gr.Markdown("### 📊 Noise Robustness\n**Description:** The experimental result of noise robustness measured by accuracy (%) under different noise ratios. Result show that the increasing noise rate poses a challenge for RAG in LLMs.")
-                noise_table = gr.Dataframe(value=load_scores(Noise_Robustness_DIR), interactive=False)
-            with gr.Column():
-                gr.Markdown("### 🚫 Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries instead of generating incorrect responses. A higher rejection rate means the model is better at filtering unreliable inputs.")
-                rejection_table = gr.Dataframe(value=load_negative_rejection_scores(), interactive=False)
-
         with gr.Row():
-            with gr.Column():
-                gr.Markdown("""
-                ### 🔄 Counterfactual Robustness
-                **Description:**
-                Counterfactual Robustness evaluates a model's ability to handle **errors in external knowledge** while ensuring reliable responses.
-
-                **Key Metrics in this Report:**
-                - **Accuracy (%)** → Measures the accuracy (%) of LLMs with counterfactual documents.
-                - **Error Detection Rate (%)** → Measures how often the model **rejects** incorrect or misleading queries instead of responding.
-                - **Correct Rate (%)** → Measures how often the model provides accurate responses despite **potential misinformation**.
-                """)
-                counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(), interactive=False)
-            with gr.Column():
-                gr.Markdown("### 🧠 Information Integration\n**Description:** The experimental result of information integration measured by accuracy (%) under different noise ratios. The result show that information integration poses a challenge for RAG in LLMs")
-                integration_table = gr.Dataframe(value=load_scores(Infomration_Integration_DIR), interactive=False)
-
-        # Refresh Scores Function
-        def refresh_scores():
-            return load_scores(Noise_Robustness_DIR), load_negative_rejection_scores(), load_counterfactual_robustness_scores(), load_scores(Infomration_Integration_DIR)
-
-        refresh_btn.click(refresh_scores, outputs=[noise_table, rejection_table, counter_factual_table, integration_table])
-
-        # Button Functions
-        def recalculate_noise_robustness(model_name, noise_rate, num_queries):
-            update_config(config, model_name, noise_rate, num_queries)
-            evaluate_noise_robustness(config)
-            return load_scores(Noise_Robustness_DIR)

-        recalculate_noise_btn.click(recalculate_noise_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[noise_table])
-
-        def recalculate_counterfactual_robustness(model_name, noise_rate, num_queries):
-            update_config(config, model_name, noise_rate, num_queries)
-            evaluate_factual_robustness(config)
-            return load_counterfactual_robustness_scores()

-        recalculate_counterfactual_btn.click(recalculate_counterfactual_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[counter_factual_table])
-
-        def recalculate_negative_rejection(model_name, noise_rate, num_queries):
-            update_config(config, model_name, noise_rate, num_queries)
-            evaluate_negative_rejection(config)
-            return load_negative_rejection_scores()

-        recalculate_negative_btn.click(recalculate_negative_rejection, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[rejection_table])

-        def recalculate_integration_info(model_name, noise_rate, num_queries):
-            update_config(config, model_name, noise_rate, num_queries)
-            evaluate_information_integration(config)
-            return load_scores(Infomration_Integration_DIR)
-
-        recalculate_integration_btn.click(recalculate_integration_info, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[integration_table])

-    app.launch()
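For reference while reading the removed loaders: `load_scores` only touches three keys per `scores_*.json` file, so a score file presumably looks like the sketch below (hypothetical values, inferred from the keys the code reads; real files may carry more fields). Note the quirk that the first table column is labeled "Noise Ratio" but is actually populated with the model name.

```python
# Hypothetical contents of a results/.../scores_*.json file (illustration only).
score = {
    "model": "llama3-8b-8192",  # placed in the "Noise Ratio" column (a naming quirk)
    "noise_rate": 0.2,          # becomes a column header after str() conversion
    "accuracy": 0.85            # rendered as "85.00" in the table
}
```

The replacement app.py follows: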
 import gradio as gr
+import logging
+import threading
+import time
+from generator.compute_metrics import get_attributes_text
+from generator.generate_metrics import generate_metrics, retrieve_and_generate_response
+from config import AppConfig, ConfigConstants
+from generator.initialize_llm import initialize_generation_llm, initialize_validation_llm
+
+def launch_gradio(config: AppConfig):
+    """
+    Launch the Gradio app with pre-initialized objects.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # Create a list to store logs
+    logs = []
+
+    # Custom log handler to capture logs and add them to the logs list
+    class LogHandler(logging.Handler):
+        def emit(self, record):
+            log_entry = self.format(record)
+            logs.append(log_entry)
+
+    # Add custom log handler to the logger
+    log_handler = LogHandler()
+    log_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+    logger.addHandler(log_handler)
+
+    def log_updater():
+        """Background function to add logs."""
+        while True:
+            time.sleep(2)  # Update logs every 2 seconds
+            pass  # Log capture is now handled by the logging system
+
+    def get_logs():
+        """Retrieve logs for display."""
+        return "\n".join(logs[-50:])  # Only show the last 50 logs for example
+
+    # Start the logging thread
+    threading.Thread(target=log_updater, daemon=True).start()
+
+    def answer_question(query, state):
+        try:
+            # Generate response using the passed objects
+            response, source_docs = retrieve_and_generate_response(config.gen_llm, config.vector_store, query)
+
+            # Update state with the response and source documents
+            state["query"] = query
+            state["response"] = response
+            state["source_docs"] = source_docs
+
+            response_text = f"Response: {response}\n\n"
+            return response_text, state
+        except Exception as e:
+            logging.error(f"Error processing query: {e}")
+            return f"An error occurred: {e}", state
+
+    def compute_metrics(state):
+        try:
+            logging.info(f"Computing metrics")
+
+            # Retrieve response and source documents from state
+            response = state.get("response", "")
+            source_docs = state.get("source_docs", {})
+            query = state.get("query", "")
+
+            # Generate metrics using the passed objects
+            attributes, metrics = generate_metrics(config.val_llm, response, source_docs, query, 1)
+
+            attributes_text = get_attributes_text(attributes)
+
+            metrics_text = "Metrics:\n"
+            for key, value in metrics.items():
+                if key != 'response':
+                    metrics_text += f"{key}: {value}\n"
+
+            return attributes_text, metrics_text
+        except Exception as e:
+            logging.error(f"Error computing metrics: {e}")
+            return f"An error occurred: {e}", ""
+
+    def reinitialize_gen_llm(gen_llm_name):
+        """Reinitialize the generation LLM and return updated model info."""
+        if gen_llm_name.strip():  # Only update if input is not empty
+            config.gen_llm = initialize_generation_llm(gen_llm_name)
+
+        # Return updated model information
+        updated_model_info = (
+            f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
+            f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
+            f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
+        )
+        return updated_model_info
+
+    def reinitialize_val_llm(val_llm_name):
+ """Reinitialize the generation LLM and return updated model info."""
+        if val_llm_name.strip():  # Only update if input is not empty
+            config.val_llm = initialize_validation_llm(val_llm_name)
+
+        # Return updated model information
+        updated_model_info = (
+            f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
+            f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
+            f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
+        )
+        return updated_model_info

+    # Define Gradio Blocks layout
+    with gr.Blocks() as interface:
+        interface.title = "Real Time RAG Pipeline Q&A"
+        gr.Markdown("### Real Time RAG Pipeline Q&A")  # Heading
+
+        # Textbox for new generation LLM name
         with gr.Row():
+            new_gen_llm_input = gr.Textbox(label="New Generation LLM Name", placeholder="Enter LLM name to update")
+            update_gen_llm_button = gr.Button("Update Generation LLM")
+            new_val_llm_input = gr.Textbox(label="New Validation LLM Name", placeholder="Enter LLM name to update")
+            update_val_llm_button = gr.Button("Update Validation LLM")

+        # Section to display LLM names
         with gr.Row():
+            model_info = f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
+            model_info += f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
+            model_info += f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
+            model_info_display = gr.Textbox(value=model_info, label="Model Information", interactive=False)  # Read-only textbox
+
+        # State to store response and source documents
+        state = gr.State(value={"query": "", "response": "", "source_docs": {}})
+        gr.Markdown("Ask a question and get a response with metrics calculated from the RAG pipeline.")  # Description
         with gr.Row():
+            query_input = gr.Textbox(label="Ask a question", placeholder="Type your query here")
         with gr.Row():
+            submit_button = gr.Button("Submit", variant="primary")  # Submit button
+            clear_query_button = gr.Button("Clear")  # Clear button
+        with gr.Row():
+            answer_output = gr.Textbox(label="Response", placeholder="Response will appear here")

+        with gr.Row():
+            compute_metrics_button = gr.Button("Compute metrics", variant="primary")
+            attr_output = gr.Textbox(label="Attributes", placeholder="Attributes will appear here")
+            metrics_output = gr.Textbox(label="Metrics", placeholder="Metrics will appear here")
+
+        #with gr.Row():
+
+        # Define button actions
+        submit_button.click(
+            fn=answer_question,
+            inputs=[query_input, state],
+            outputs=[answer_output, state]
+        )
+        clear_query_button.click(fn=lambda: "", outputs=[query_input])  # Clear query input
+        compute_metrics_button.click(
+            fn=compute_metrics,
+            inputs=[state],
+            outputs=[attr_output, metrics_output]
+        )

+        update_gen_llm_button.click(
+            fn=reinitialize_gen_llm,
+            inputs=[new_gen_llm_input],
+            outputs=[model_info_display]  # Update the displayed model info
+        )
+
+        update_val_llm_button.click(
+            fn=reinitialize_val_llm,
+            inputs=[new_val_llm_input],
+            outputs=[model_info_display]  # Update the displayed model info
+        )

+        # Section to display logs
+        with gr.Row():
+            start_log_button = gr.Button("Start Log Update", elem_id="start_btn")  # Button to start log updates
+        with gr.Row():
+            log_section = gr.Textbox(label="Logs", interactive=False, visible=True, lines=10)  # Log section

+        # Set button click to trigger log updates
+        start_log_button.click(fn=get_logs, outputs=log_section)

+    interface.launch()
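A note on the log panel in the new app.py: the capture is done entirely by the custom `LogHandler`, which appends every formatted record to the in-memory `logs` list as it is emitted. The `log_updater` background thread is effectively a no-op (its loop only sleeps), and the "Start Log Update" button re-reads the tail of the list on each click rather than streaming. A minimal standalone sketch of that pattern, with illustrative names not taken from the commit:

```python
import logging

# Sketch of the in-memory log capture used in launch_gradio (illustrative names).
# A Handler subclass appends each formatted record to a list; the UI reads the tail.
captured = []

class ListHandler(logging.Handler):
    def emit(self, record):
        captured.append(self.format(record))

handler = ListHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
root = logging.getLogger()
root.addHandler(handler)
root.setLevel(logging.INFO)

logging.info("pipeline step finished")  # captured via ListHandler.emit
print("\n".join(captured[-50:]))        # equivalent of get_logs()
```

Since the `logs` list only ever grows, a `collections.deque(maxlen=...)` would be a natural drop-in if memory became a concern, though the committed code keeps a plain list.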
config.py ADDED
@@ -0,0 +1,16 @@
+
+class ConfigConstants:
+    # Constants related to datasets and models
+    DATA_SET_NAMES = ['covidqa', 'cuad', 'techqa']#, 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa']
+    EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
+    GENERATION_MODEL_NAME = 'mixtral-8x7b-32768'
+    VALIDATION_MODEL_NAME = 'llama3-70b-8192'
+    DEFAULT_CHUNK_SIZE = 1000
+    CHUNK_OVERLAP = 200
+
+class AppConfig:
+    def __init__(self, vector_store, gen_llm, val_llm):
+        self.vector_store = vector_store
+        self.gen_llm = gen_llm
+        self.val_llm = val_llm
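`AppConfig` is a plain mutable container: main.py fills it with the vector store and the two LLM clients, and app.py reassigns `gen_llm`/`val_llm` in place when the user updates a model name. A minimal usage sketch (the placeholder objects stand in for what `embed_documents` and the `initialize_*_llm` helpers return; only the `config` import is taken from the commit):

```python
from config import AppConfig, ConfigConstants

# Placeholders for illustration; in main.py these come from embed_documents()
# and initialize_generation_llm()/initialize_validation_llm().
vector_store, gen_llm, val_llm = object(), object(), object()

config = AppConfig(vector_store=vector_store, gen_llm=gen_llm, val_llm=val_llm)
print(ConfigConstants.GENERATION_MODEL_NAME)  # 'mixtral-8x7b-32768'

# Plain attribute assignment is all app.py's reinitialize_gen_llm() does:
config.gen_llm = object()  # swap in a newly initialized LLM at runtime
```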
main.py CHANGED
@@ -1,43 +1,64 @@
-import json
 import logging
-from app import launch_gradio_app
-from scripts.evaluate_factual_robustness import evaluate_factual_robustness
-from scripts.evaluate_negative_rejection import evaluate_negative_rejection
-from scripts.evaluate_noise_robustness import evaluate_noise_robustness
-from scripts.download_files import download_file, get_file_list
-
-def load_config(config_file="config.json"):
-    """Load configuration from the config file."""
-    try:
-        with open(config_file, "r", encoding="utf-8") as f:
-            config = json.load(f)
-            return config
-    except Exception as e:
-        logging.info(f"Error loading config: {e}")
-        return {}

 def main():
-    # Load configuration
-    config = load_config()

-    logging.info(f"Model: {config["model_name"]}")
-    logging.info(f"Noise Rate: {config["noise_rate"]}")
-    logging.info(f"Passage Number: {config["passage_num"]}")
-    logging.info(f"Number of Queries: {config["num_queries"]}")

-    # Download files from the GitHub repository
-    files = get_file_list()
-    for file in files:
-        download_file(file)

-    # Load dataset from the local JSON file

-    # Call evaluate_noise_robustness for each noise rate and model
-    #evaluate_noise_robustness(config)
-    #evaluate_negative_rejection(config)
-    #evaluate_factual_robustness(config)
-    launch_gradio_app(config)

 if __name__ == "__main__":
-    main()
 import logging
+from config import AppConfig, ConfigConstants
+from data.load_dataset import load_data
+from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
+from retriever.chunk_documents import chunk_documents
+from retriever.embed_documents import embed_documents
+from generator.initialize_llm import initialize_generation_llm
+from generator.initialize_llm import initialize_validation_llm
+from app import launch_gradio
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 def main():
+    logging.info("Starting the RAG pipeline")
+
+    # List to store chunked documents
+    all_chunked_documents = []
+    datasets = {}

+    # Load multiple datasets
+    for data_set_name in ConfigConstants.DATA_SET_NAMES:
+        logging.info(f"Loading dataset: {data_set_name}")
+        datasets[data_set_name] = load_data(data_set_name)

+        # Set chunk size based on dataset name
+        chunk_size = ConfigConstants.DEFAULT_CHUNK_SIZE
+        if data_set_name == 'cuad':
+            chunk_size = 4000  # Custom chunk size for 'cuad'
+
+        # Chunk documents
+        chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=ConfigConstants.CHUNK_OVERLAP)
+        all_chunked_documents.extend(chunked_documents)  # Combine all chunks

+    # Access individual datasets
+    #for name, dataset in datasets.items():
+        #logging.info(f"Loaded {name} with {dataset.num_rows} rows")
+
+    # Logging final count
+    logging.info(f"Total chunked documents: {len(all_chunked_documents)}")
+
+    # Embed the documents
+    vector_store = embed_documents(all_chunked_documents)
+    logging.info("Documents embedded")

+    # Initialize the Generation LLM
+    gen_llm = initialize_generation_llm(ConfigConstants.GENERATION_MODEL_NAME)

+    # Initialize the Validation LLM
+    val_llm = initialize_validation_llm(ConfigConstants.VALIDATION_MODEL_NAME)
+
+    # Compute RMSE and AUC-ROC for entire dataset
+    # Enable below code for calculation
+    #data_set_name = 'covidqa'
+    #compute_rmse_auc_roc_metrics(gen_llm, val_llm, datasets[data_set_name], vector_store, 10)

+    # Launch the Gradio app
+    config = AppConfig(vector_store=vector_store, gen_llm=gen_llm, val_llm=val_llm)
+    launch_gradio(config)
+
+    logging.info("Finished!!!")
+
 if __name__ == "__main__":
+    main()
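The dataset-level RMSE/AUC-ROC evaluation ships commented out; per the comment, enabling it is just a matter of uncommenting the two lines inside main() after the validation LLM is initialized:

```python
# Uncommented from the block above; runs the dataset-level evaluation on 'covidqa'.
# The trailing 10 appears to be the number of queries to evaluate (not confirmed here).
data_set_name = 'covidqa'
compute_rmse_auc_roc_metrics(gen_llm, val_llm, datasets[data_set_name], vector_store, 10)
```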