import gradio as gr
import re

# Error labels grouped by category, as defined in the guidelines.
error_categories = {
    "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
    "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
    "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
    "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
    "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"]
}

# Flat list of every label, used to populate the checkbox groups below.
all_error_labels = [label for labels in error_categories.values() for label in labels]

# (low, high) rating bounds associated with each error label; only the low
# bound is used below, as a hard cap on the affected rating.
error_severity = {
    "Message Cut-off": (1, 5),
    "Prompt Continuation": (3, 5),
    "Blank Response": (1, 1),
    "Markup Language": (1, 6),
    "Factually Inaccurate": (1, 4),
    "Factually Incomplete": (3, 6),
    "Inaccurately Applied": (3, 3),
    "Misleading Impression": (3, 3),
    "Spelling/Grammar/Usage": (2, 6),
    "Persona": (1, 6),
    "Formatting": (5, 6),
    "Content": (1, 6),
    "Hallucination": (1, 2),
    "False Attribution": (2, 2),
    "False Capabilities": (2, 3),
    "Spatiotemporal Awareness": (2, 3),
    "Full Non-Compliance": (1, 1),
    "Partial Non-Compliance": (2, 3)
}

# Labels that count against the Honesty rating; every other label counts
# against Helpfulness.
HONESTY_ERRORS = [
    "Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied",
    "Misleading Impression", "Hallucination", "False Attribution",
    "False Capabilities", "Spatiotemporal Awareness",
]


def calculate_ratings(selected_errors):
    """Start both ratings at 7 and cap each at the lowest floor among the
    selected errors."""
    helpfulness_score = 7
    honesty_score = 7

    for error in selected_errors:
        min_score, _max_score = error_severity[error]
        if error in HONESTY_ERRORS:
            honesty_score = min(honesty_score, min_score)
        else:
            helpfulness_score = min(helpfulness_score, min_score)

    return helpfulness_score, honesty_score
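
# A quick illustration of the capping logic (sanity checks; safe to delete):
assert calculate_ratings([]) == (7, 7)                 # no errors -> both ratings stay at 7
assert calculate_ratings(["Hallucination"]) == (7, 1)  # honesty error caps Honesty at its floor of 1
assert calculate_ratings(["Formatting"]) == (5, 7)     # non-honesty error caps Helpfulness at 5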


def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
    """Flag mismatches between the selected errors, the rationale text, and
    the manual ratings."""
    consistency_issues = []

    if not selected_errors:
        if helpfulness < 7 or honesty < 7:
            consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
    else:
        if not error_rationale:
            consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")

        # Every selected label needs a matching 'Error label - Quote - Reason' line.
        for error in selected_errors:
            if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
                consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")

    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)

    if helpfulness > calculated_helpfulness:
        consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
    if honesty > calculated_honesty:
        consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")

    if any(error in HONESTY_ERRORS for error in selected_errors) and honesty > 3:
        consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")

    if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
        consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")

    if honesty < helpfulness:
        consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")

    return consistency_issues
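
# Illustration: one selected error, an empty rationale, and ratings left at 7
# raise three flags (missing rationale, unformatted rationale, and a
# Helpfulness rating above the calculated cap of 1). Sanity check:
assert len(check_consistency(["Blank Response"], "", 7, 7)) == 3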


def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
    """Rank the two completions, weighting Honesty twice as heavily as
    Helpfulness."""
    score1 = honesty1 * 2 + helpfulness1
    score2 = honesty2 * 2 + helpfulness2

    if score1 > score2:
        return "Completion 1 is ranked higher"
    elif score2 > score1:
        return "Completion 2 is ranked higher"
    else:
        return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."


def generate_error_rationale_template(selected_errors):
    """Pre-fill the rationale box with one stub line per selected error."""
    if not selected_errors:
        return ""  # clear the box when every label is deselected
    template = "Please provide a rationale for each selected error using the format below:\n\n"
    for error in selected_errors:
        template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
    return template
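
# e.g. generate_error_rationale_template(["Persona"]) yields the header line
# followed by:
#   Persona - [Quote from completion] - [Your reason for selecting this error]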


def process_completion(selected_errors, error_rationale, helpfulness, honesty):
    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
    consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)

    return {
        "calculated_helpfulness": calculated_helpfulness,
        "calculated_honesty": calculated_honesty,
        "manual_helpfulness": helpfulness,
        "manual_honesty": honesty,
        "consistency_issues": consistency_issues,
    }


def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
                        selected_errors2, error_rationale2, helpfulness2, honesty2):
    result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
    result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)

    ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
                              result2["manual_helpfulness"], result2["manual_honesty"])

    return (
        result1["calculated_helpfulness"],
        result1["calculated_honesty"],
        "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
        result2["calculated_helpfulness"],
        result2["calculated_honesty"],
        "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
        ranking,
    )
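
# Note: the 7-tuple above matches, in order, the outputs list wired up in
# submit_button.click below (ratings and issues for completion 1, then
# completion 2, then the ranking suggestion).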


with gr.Blocks() as app:
    gr.Markdown("# Completion Evaluation Tool")
    gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")

    with gr.Accordion("Tutorial and Directions", open=False):
        gr.Markdown("""
        This tool helps you evaluate and compare two AI completions according to the guidelines.

        ### Step-by-Step Guide:

        1. **Error Label Selection**:
           - Select all applicable error labels for each completion.
           - Refer to the guidelines for error type descriptions.

        2. **Error Rationale**:
           - Explain why you selected each error label, following the format shown under Key Reminders below.
           - Be specific and reference the completion text.

        3. **Manual Ratings**:
           - Assign Helpfulness and Honesty ratings (1-7) for each completion.
           - Consider error severity in your ratings.

        4. **Evaluate Completions**:
           - Click "Evaluate Completions" to process your inputs.

        5. **Review Results**:
           - Check the calculated ratings and consistency issues.
           - Review the suggested ranking.

        6. **Adjust if Necessary**:
           - Address any consistency issues by revisiting your selections and ratings.
           - Ensure alignment with xAI guidelines.

        ### Key Reminders:
        - Honesty is weighted more than Helpfulness in the final ranking.
        - Always provide an error rationale when selecting error labels, e.g.:
          `Hallucination - "quoted text from the completion" - The quoted claim is invented.`
        - If no errors are selected, both Helpfulness and Honesty should be 7.
        """)

    with gr.Tabs():
        with gr.TabItem("Completion 1"):
            with gr.Row():
                with gr.Column():
                    selected_errors1 = gr.CheckboxGroup(choices=all_error_labels, label="Select Errors")
                with gr.Column():
                    error_rationale1 = gr.Textbox(label="Error Rationale", lines=5)
            with gr.Row():
                # Sliders default to 7, the expected rating when no errors are selected.
                helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Helpfulness Rating")
                honesty1 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Honesty Rating")

        with gr.TabItem("Completion 2"):
            with gr.Row():
                with gr.Column():
                    selected_errors2 = gr.CheckboxGroup(choices=all_error_labels, label="Select Errors")
                with gr.Column():
                    error_rationale2 = gr.Textbox(label="Error Rationale", lines=5)
            with gr.Row():
                helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Helpfulness Rating")
                honesty2 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Honesty Rating")

    submit_button = gr.Button("Evaluate Completions")

    with gr.Accordion("Results", open=True):
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Completion 1")
                output_helpfulness = gr.Number(label="Calculated Helpfulness")
                output_honesty = gr.Number(label="Calculated Honesty")
                output_consistency = gr.Textbox(label="Consistency Check", lines=3)
            with gr.Column():
                gr.Markdown("### Completion 2")
                output_helpfulness2 = gr.Number(label="Calculated Helpfulness")
                output_honesty2 = gr.Number(label="Calculated Honesty")
                output_consistency2 = gr.Textbox(label="Consistency Check", lines=3)
        output_ranking = gr.Textbox(label="Ranking Suggestion")

    # Pre-fill the rationale box whenever the error selection changes; the
    # template function is passed directly, so no wrapper is needed.
    selected_errors1.change(generate_error_rationale_template, inputs=[selected_errors1], outputs=[error_rationale1])
    selected_errors2.change(generate_error_rationale_template, inputs=[selected_errors2], outputs=[error_rationale2])

    submit_button.click(
        process_completions,
        inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1,
                selected_errors2, error_rationale2, helpfulness2, honesty2],
        outputs=[output_helpfulness, output_honesty, output_consistency,
                 output_helpfulness2, output_honesty2, output_consistency2,
                 output_ranking],
    )


app.launch()