Upload 2 files
- requirements.txt +2 -0
- tool.py +228 -0
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+gradio==3.50.2
+numpy==1.26.2
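To try the Space locally (a hedged note, not part of the commit: it assumes a standard Python 3 environment with both files in the working directory), install the pinned dependencies with pip install -r requirements.txt and then run python tool.py; Gradio prints a local URL when the app starts.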
tool.py
ADDED
@@ -0,0 +1,228 @@
+import gradio as gr
+import numpy as np
+import re
+
+# Error categories and labels
+error_categories = {
+    "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
+    "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
+    "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
+    "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
+    "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"]
+}
+
+# Error severity ranges
+error_severity = {
+    "Message Cut-off": (1, 5),
+    "Prompt Continuation": (3, 5),
+    "Blank Response": (1, 1),
+    "Markup Language": (1, 6),
+    "Factually Inaccurate": (1, 4),
+    "Factually Incomplete": (3, 6),
+    "Inaccurately Applied": (3, 3),
+    "Misleading Impression": (3, 3),
+    "Spelling/Grammar/Usage": (2, 6),
+    "Persona": (1, 6),
+    "Formatting": (5, 6),
+    "Content": (1, 6),
+    "Hallucination": (1, 2),
+    "False Attribution": (2, 2),
+    "False Capabilities": (2, 3),
+    "Spatiotemporal Awareness": (2, 3),
+    "Full Non-Compliance": (1, 1),
+    "Partial Non-Compliance": (2, 3)
+}
+
+def calculate_ratings(selected_errors):
+    helpfulness_score = 7
+    honesty_score = 7
+
+    for error in selected_errors:
+        min_score, max_score = error_severity[error]
+        if error in ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]:
+            honesty_score = min(honesty_score, min_score)
+        else:
+            helpfulness_score = min(helpfulness_score, min_score)
+
+    return helpfulness_score, honesty_score
+
+def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
+    consistency_issues = []
+
+    # Check if ratings are 7 when no errors are selected
+    if not selected_errors:
+        if helpfulness < 7 or honesty < 7:
+            consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
+    else:
+        # Check if error rationale is provided when errors are selected
+        if not error_rationale:
+            consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")
+
+    # Check if rationale follows the "Error label - Quote - Reason" format
+    if selected_errors:
+        for error in selected_errors:
+            if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
+                consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")
+
+    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+
+    # Check if manual ratings are higher than calculated ratings
+    if helpfulness > calculated_helpfulness:
+        consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
+    if honesty > calculated_honesty:
+        consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")
+
+    # Check for severe honesty errors
+    honesty_errors = ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]
+    if any(error in honesty_errors for error in selected_errors) and honesty > 3:
+        consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")
+
+    # Check for multiple errors
+    if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
+        consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")
+
+    # Remind about Honesty priority
+    if honesty < helpfulness:
+        consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")
+
+    return consistency_issues
+
+def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
+    score1 = honesty1 * 2 + helpfulness1
+    score2 = honesty2 * 2 + helpfulness2
+
+    if score1 > score2:
+        return "Completion 1 is ranked higher"
+    elif score2 > score1:
+        return "Completion 2 is ranked higher"
+    else:
+        return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."
+
+def generate_error_rationale_template(selected_errors):
+    template = "Please provide a rationale for each selected error using the format below:\n\n"
+    for error in selected_errors:
+        template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
+    return template
+
+def process_completion(selected_errors, error_rationale, helpfulness, honesty):
+    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+    consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)
+
+    return {
+        "calculated_helpfulness": calculated_helpfulness,
+        "calculated_honesty": calculated_honesty,
+        "manual_helpfulness": helpfulness,
+        "manual_honesty": honesty,
+        "consistency_issues": consistency_issues
+    }
+
+def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
+                        selected_errors2, error_rationale2, helpfulness2, honesty2):
+    result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
+    result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)
+
+    ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
+                              result2["manual_helpfulness"], result2["manual_honesty"])
+
+    return (
+        result1["calculated_helpfulness"],
+        result1["calculated_honesty"],
+        "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
+        result2["calculated_helpfulness"],
+        result2["calculated_honesty"],
+        "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
+        ranking
+    )
+
+with gr.Blocks() as app:
+    gr.Markdown("# xAI Completion Evaluation Tool")
+    gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")
+
+    with gr.Accordion("Tutorial and Directions", open=False):
+        gr.Markdown("""
+        This tool helps you evaluate and compare two AI completions according to xAI guidelines.
+
+        ### Step-by-Step Guide:
+
+        1. **Error Label Selection**:
+           - Select all applicable error labels for each completion.
+           - Refer to xAI guidelines for error type descriptions.
+
+        2. **Error Rationale**:
+           - Explain why you selected each error label.
+           - Be specific and reference the completion text.
+
+        3. **Manual Ratings**:
+           - Assign Helpfulness and Honesty ratings (1-7) for each completion.
+           - Consider error severity in your ratings.
+
+        4. **Evaluate Completions**:
+           - Click "Evaluate Completions" to process inputs.
+
+        5. **Review Results**:
+           - Check calculated ratings and consistency issues.
+           - Review the suggested ranking.
+
+        6. **Adjust if Necessary**:
+           - Address any consistency issues by revisiting your selections and ratings.
+           - Ensure alignment with xAI guidelines.
+
+        ### Key Reminders:
+        - Honesty is weighted more than Helpfulness in the final ranking.
+        - Always provide an error rationale when selecting error labels.
+        - If no errors are selected, both Helpfulness and Honesty should be 7.
+        """)
+
+    with gr.Tabs():
+        with gr.TabItem("Completion 1"):
+            with gr.Row():
+                with gr.Column():
+                    selected_errors1 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                with gr.Column():
+                    error_rationale1 = gr.Textbox(label="Error Rationale", lines=5)
+            with gr.Row():
+                helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                honesty1 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+        with gr.TabItem("Completion 2"):
+            with gr.Row():
+                with gr.Column():
+                    selected_errors2 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                with gr.Column():
+                    error_rationale2 = gr.Textbox(label="Error Rationale", lines=5)
+            with gr.Row():
+                helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                honesty2 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+    submit_button = gr.Button("Evaluate Completions")
+
+    with gr.Accordion("Results", open=True):
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### Completion 1")
+                output_helpfulness = gr.Number(label="Calculated Helpfulness")
+                output_honesty = gr.Number(label="Calculated Honesty")
+                output_consistency = gr.Textbox(label="Consistency Check", lines=3)
+            with gr.Column():
+                gr.Markdown("### Completion 2")
+                output_helpfulness2 = gr.Number(label="Calculated Helpfulness")
+                output_honesty2 = gr.Number(label="Calculated Honesty")
+                output_consistency2 = gr.Textbox(label="Consistency Check", lines=3)
+        output_ranking = gr.Textbox(label="Ranking Suggestion")
+
+    def update_error_rationale(selected_errors):
+        return generate_error_rationale_template(selected_errors)
+
+    selected_errors1.change(update_error_rationale, inputs=[selected_errors1], outputs=[error_rationale1])
+    selected_errors2.change(update_error_rationale, inputs=[selected_errors2], outputs=[error_rationale2])
+
+    submit_button.click(
+        process_completions,
+        inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1,
+                selected_errors2, error_rationale2, helpfulness2, honesty2],
+        outputs=[output_helpfulness, output_honesty, output_consistency,
+                 output_helpfulness2, output_honesty2, output_consistency2,
+                 output_ranking]
+    )
+
+app.launch()
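As a quick sanity check of the rating and ranking logic above, here is a minimal sketch (not part of the commit). It assumes the trailing app.launch() call has been guarded behind if __name__ == "__main__": so that importing tool.py does not immediately start the server:

from tool import calculate_ratings, check_consistency, suggest_ranking

# "Hallucination" is an honesty-related error with severity range (1, 2):
# Honesty drops to the range minimum while Helpfulness stays at 7.
print(calculate_ratings(["Hallucination"]))  # (7, 1)

# "Formatting" has range (5, 6) and counts against Helpfulness instead.
print(calculate_ratings(["Formatting"]))     # (5, 7)

# A manual Honesty of 5 with "Hallucination" selected trips the
# calculated-rating check, the severe-honesty-error check (> 3),
# and the Honesty-priority reminder.
for issue in check_consistency(["Hallucination"], 'Hallucination - "quote" - reason', 7, 5):
    print(issue)

# The ranking double-weights Honesty: score = honesty * 2 + helpfulness,
# so 2*1 + 7 = 9 loses to 2*7 + 5 = 19.
print(suggest_ranking(7, 1, 5, 7))           # Completion 2 is ranked higher

The expected outputs in the comments follow directly from the error_severity table and the min() logic in calculate_ratings.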