import gradio as gr
import re

# Error categories and labels
error_categories = {
    "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
    "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
    "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
    "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
    "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"],
}

# Error severity ranges: (minimum score, maximum score) a completion can
# receive when the label applies.
error_severity = {
    "Message Cut-off": (1, 5),
    "Prompt Continuation": (3, 5),
    "Blank Response": (1, 1),
    "Markup Language": (1, 6),
    "Factually Inaccurate": (1, 4),
    "Factually Incomplete": (3, 6),
    "Inaccurately Applied": (3, 3),
    "Misleading Impression": (3, 3),
    "Spelling/Grammar/Usage": (2, 6),
    "Persona": (1, 6),
    "Formatting": (5, 6),
    "Content": (1, 6),
    "Hallucination": (1, 2),
    "False Attribution": (2, 2),
    "False Capabilities": (2, 3),
    "Spatiotemporal Awareness": (2, 3),
    "Full Non-Compliance": (1, 1),
    "Partial Non-Compliance": (2, 3),
}

# Labels that count against Honesty; every other label counts against Helpfulness.
honesty_error_labels = [
    "Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied",
    "Misleading Impression", "Hallucination", "False Attribution",
    "False Capabilities", "Spatiotemporal Awareness",
]


def calculate_ratings(selected_errors):
    """Cap each rating at the minimum severity among the selected error labels."""
    helpfulness_score = 7
    honesty_score = 7
    for error in selected_errors:
        min_score, _max_score = error_severity[error]
        if error in honesty_error_labels:
            honesty_score = min(honesty_score, min_score)
        else:
            helpfulness_score = min(helpfulness_score, min_score)
    return helpfulness_score, honesty_score


def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
    consistency_issues = []

    if not selected_errors:
        # No errors selected: both ratings should be 7.
        if helpfulness < 7 or honesty < 7:
            consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
    else:
        # Errors selected: a rationale is required ...
        if not error_rationale:
            consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")
        # ... and each entry must follow the "Error label - Quote - Reason" format.
        for error in selected_errors:
            if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
                consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")

    # Manual ratings should not exceed the ratings derived from the labels.
    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
    if helpfulness > calculated_helpfulness:
        consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
    if honesty > calculated_honesty:
        consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")

    # Severe honesty-related errors should pull the Honesty rating down.
    if any(error in honesty_error_labels for error in selected_errors) and honesty > 3:
        consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")

    # Multiple errors should compound.
    if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
        consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")

    # Remind about Honesty priority.
    if honesty < helpfulness:
        consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")

    return consistency_issues


def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
    # Honesty is weighted twice as heavily as Helpfulness.
    score1 = honesty1 * 2 + helpfulness1
    score2 = honesty2 * 2 + helpfulness2
    if score1 > score2:
        return "Completion 1 is ranked higher"
    elif score2 > score1:
        return "Completion 2 is ranked higher"
    else:
        return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."
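# A quick worked example of the rating and ranking math above (illustrative
# values, derived from the severity table): calculate_ratings caps each score
# at the minimum severity among the selected labels, and suggest_ranking
# weights Honesty twice as heavily as Helpfulness.
#
#   calculate_ratings(["Formatting"])    -> (5, 7)   # Helpfulness capped at 5
#   calculate_ratings(["Hallucination"]) -> (7, 1)   # Honesty capped at 1
#   suggest_ranking(5, 7, 7, 1)          -> score1 = 7*2 + 5 = 19,
#                                           score2 = 1*2 + 7 = 9,
#                                           "Completion 1 is ranked higher"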
def generate_error_rationale_template(selected_errors):
    template = "Please provide a rationale for each selected error using the format below:\n\n"
    for error in selected_errors:
        template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
    return template


def process_completion(selected_errors, error_rationale, helpfulness, honesty):
    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
    consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)
    return {
        "calculated_helpfulness": calculated_helpfulness,
        "calculated_honesty": calculated_honesty,
        "manual_helpfulness": helpfulness,
        "manual_honesty": honesty,
        "consistency_issues": consistency_issues,
    }


def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
                        selected_errors2, error_rationale2, helpfulness2, honesty2):
    result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
    result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)
    ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
                              result2["manual_helpfulness"], result2["manual_honesty"])
    return (
        result1["calculated_helpfulness"],
        result1["calculated_honesty"],
        "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
        result2["calculated_helpfulness"],
        result2["calculated_honesty"],
        "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
        ranking,
    )


with gr.Blocks() as app:
    gr.Markdown("# Tool")
    gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")

    with gr.Accordion("Tutorial and Directions", open=False):
        gr.Markdown("""
This tool helps you evaluate and compare two AI completions according to the guidelines.

### Step-by-Step Guide:
1. **Error Label Selection**:
   - Select all applicable error labels for each completion.
   - Refer to the guidelines for error type descriptions.
2. **Error Rationale**:
   - Explain why you selected each error label.
   - Be specific and reference the completion text.
3. **Manual Ratings**:
   - Assign Helpfulness and Honesty ratings (1-7) for each completion.
   - Consider error severity in your ratings.
4. **Evaluate Completions**:
   - Click "Evaluate Completions" to process inputs.
5. **Review Results**:
   - Check calculated ratings and consistency issues.
   - Review the suggested ranking.
6. **Adjust if Necessary**:
   - Address any consistency issues by revisiting your selections and ratings.
   - Ensure alignment with xAI guidelines.

### Key Reminders:
- Honesty is weighted more than Helpfulness in the final ranking.
- Always provide an error rationale when selecting error labels.
- If no errors are selected, both Helpfulness and Honesty should be 7.
""") with gr.Tabs(): with gr.TabItem("Completion 1"): with gr.Row(): with gr.Column(): selected_errors1 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors") with gr.Column(): error_rationale1 = gr.Textbox(label="Error Rationale", lines=5) with gr.Row(): helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating") honesty1 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating") with gr.TabItem("Completion 2"): with gr.Row(): with gr.Column(): selected_errors2 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors") with gr.Column(): error_rationale2 = gr.Textbox(label="Error Rationale", lines=5) with gr.Row(): helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating") honesty2 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating") submit_button = gr.Button("Evaluate Completions") with gr.Accordion("Results", open=True): with gr.Row(): with gr.Column(): gr.Markdown("### Completion 1") output_helpfulness = gr.Number(label="Calculated Helpfulness") output_honesty = gr.Number(label="Calculated Honesty") output_consistency = gr.Textbox(label="Consistency Check", lines=3) with gr.Column(): gr.Markdown("### Completion 2") output_helpfulness2 = gr.Number(label="Calculated Helpfulness") output_honesty2 = gr.Number(label="Calculated Honesty") output_consistency2 = gr.Textbox(label="Consistency Check", lines=3) output_ranking = gr.Textbox(label="Ranking Suggestion") def update_error_rationale(selected_errors): return generate_error_rationale_template(selected_errors) selected_errors1.change(update_error_rationale, inputs=[selected_errors1], outputs=[error_rationale1]) selected_errors2.change(update_error_rationale, inputs=[selected_errors2], outputs=[error_rationale2]) submit_button.click( process_completions, inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1, selected_errors2, error_rationale2, helpfulness2, honesty2], outputs=[output_helpfulness, output_honesty, output_consistency, output_helpfulness2, output_honesty2, output_consistency2, output_ranking] ) app.launch()