Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- scripts/download_files.py +3 -1
- scripts/evaluate_factual_robustness.py +8 -8
- scripts/evaluate_information_integration.py +5 -5
- scripts/evaluate_negative_rejection.py +7 -7
- scripts/evaluate_noise_robustness.py +5 -5
- scripts/get_factual_evaluation.py +4 -4
- scripts/get_prediction_result.py +7 -7
- scripts/helper.py +3 -3
scripts/download_files.py
CHANGED
|
@@ -7,6 +7,9 @@ LOCAL_SAVE_PATH = "data" # Path where files will be saved
|
|
| 7 |
GITHUB_API_URL = "https://api.github.com/repos/chen700564/RGB/contents/data"
|
| 8 |
RAW_BASE_URL = "https://raw.githubusercontent.com/chen700564/RGB/master/data/"
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
def get_file_list():
|
| 11 |
"""Fetch the list of files from the GitHub repository."""
|
| 12 |
response = requests.get(GITHUB_API_URL)
|
|
@@ -30,7 +33,6 @@ def download_file(file_name):
|
|
| 30 |
|
| 31 |
file_url = RAW_BASE_URL + file_name
|
| 32 |
local_file_path = os.path.join(LOCAL_SAVE_PATH, file_name)
|
| 33 |
-
|
| 34 |
response = requests.get(file_url, stream=True)
|
| 35 |
if response.status_code == 200:
|
| 36 |
total_size = int(response.headers.get("content-length", 0))
|
|
|
|
| 7 |
GITHUB_API_URL = "https://api.github.com/repos/chen700564/RGB/contents/data"
|
| 8 |
RAW_BASE_URL = "https://raw.githubusercontent.com/chen700564/RGB/master/data/"
|
| 9 |
|
| 10 |
+
# Ensure the directory exists before downloading
|
| 11 |
+
os.makedirs(LOCAL_SAVE_PATH, exist_ok=True)
|
| 12 |
+
|
| 13 |
def get_file_list():
|
| 14 |
"""Fetch the list of files from the GitHub repository."""
|
| 15 |
response = requests.get(GITHUB_API_URL)
|
|
|
|
| 33 |
|
| 34 |
file_url = RAW_BASE_URL + file_name
|
| 35 |
local_file_path = os.path.join(LOCAL_SAVE_PATH, file_name)
|
|
|
|
| 36 |
response = requests.get(file_url, stream=True)
|
| 37 |
if response.status_code == 200:
|
| 38 |
total_size = int(response.headers.get("content-length", 0))
|
scripts/evaluate_factual_robustness.py
CHANGED
|
@@ -9,15 +9,15 @@ from scripts.prompt import get_factual_prompt
|
|
| 9 |
|
| 10 |
def evaluate_factual_robustness(config):
|
| 11 |
"""Evaluates negative rejection for a given model by processing predictions and computing scores."""
|
| 12 |
-
config[
|
| 13 |
-
modelname = config[
|
| 14 |
-
noise_rate = config[
|
| 15 |
-
passage_num = config[
|
| 16 |
|
| 17 |
-
if config[
|
| 18 |
-
model = GroqClient(plm=config[
|
| 19 |
else:
|
| 20 |
-
logging.warning(f"Skipping unknown model: {config[
|
| 21 |
return
|
| 22 |
|
| 23 |
# File paths
|
|
@@ -84,7 +84,7 @@ def evaluate_factual_robustness(config):
|
|
| 84 |
'rejecttt':rejecttt,
|
| 85 |
'correct_tt':correct_tt,
|
| 86 |
'nums': len(results),
|
| 87 |
-
'noise_rate': config[
|
| 88 |
}
|
| 89 |
return scores
|
| 90 |
|
|
|
|
| 9 |
|
| 10 |
def evaluate_factual_robustness(config):
|
| 11 |
"""Evaluates negative rejection for a given model by processing predictions and computing scores."""
|
| 12 |
+
config['noise_rate'] = 0.4 # Time being to do clarification
|
| 13 |
+
modelname = config['model_name']
|
| 14 |
+
noise_rate = config['noise_rate']
|
| 15 |
+
passage_num = config['passage_num']
|
| 16 |
|
| 17 |
+
if config['model_name'] in config["models"]:
|
| 18 |
+
model = GroqClient(plm=config['model_name'])
|
| 19 |
else:
|
| 20 |
+
logging.warning(f"Skipping unknown model: {config['model_name']}")
|
| 21 |
return
|
| 22 |
|
| 23 |
# File paths
|
|
|
|
| 84 |
'rejecttt':rejecttt,
|
| 85 |
'correct_tt':correct_tt,
|
| 86 |
'nums': len(results),
|
| 87 |
+
'noise_rate': config['noise_rate'],
|
| 88 |
}
|
| 89 |
return scores
|
| 90 |
|
scripts/evaluate_information_integration.py
CHANGED
|
@@ -11,11 +11,11 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def evaluate_information_integration(config):
|
| 13 |
result_path = config["result_path"] + 'Information Integration/'
|
| 14 |
-
noise_rate = config[
|
| 15 |
-
passage_num = config[
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
-
filename = os.path.join(result_path, f'prediction_{config[
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
@@ -45,7 +45,7 @@ def evaluate_information_integration(config):
|
|
| 45 |
|
| 46 |
# Save the final score file with tt and all_rate
|
| 47 |
scores = {
|
| 48 |
-
'model': config[
|
| 49 |
'accuracy': accuracy,
|
| 50 |
'noise_rate': noise_rate,
|
| 51 |
'correct_count': correct_count,
|
|
@@ -56,7 +56,7 @@ def evaluate_information_integration(config):
|
|
| 56 |
logging.info(f"Score: {scores}")
|
| 57 |
logging.info(f"Information Integration Accuracy: {accuracy:.2%}")
|
| 58 |
|
| 59 |
-
score_filename = os.path.join(result_path, f'scores_{config[
|
| 60 |
with open(score_filename, 'w') as f:
|
| 61 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 62 |
|
|
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def evaluate_information_integration(config):
|
| 13 |
result_path = config["result_path"] + 'Information Integration/'
|
| 14 |
+
noise_rate = config['noise_rate']
|
| 15 |
+
passage_num = config['passage_num']
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
+
filename = os.path.join(result_path, f'prediction_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
|
|
| 45 |
|
| 46 |
# Save the final score file with tt and all_rate
|
| 47 |
scores = {
|
| 48 |
+
'model': config['model_name'],
|
| 49 |
'accuracy': accuracy,
|
| 50 |
'noise_rate': noise_rate,
|
| 51 |
'correct_count': correct_count,
|
|
|
|
| 56 |
logging.info(f"Score: {scores}")
|
| 57 |
logging.info(f"Information Integration Accuracy: {accuracy:.2%}")
|
| 58 |
|
| 59 |
+
score_filename = os.path.join(result_path, f'scores_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 60 |
with open(score_filename, 'w') as f:
|
| 61 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 62 |
|
scripts/evaluate_negative_rejection.py
CHANGED
|
@@ -10,15 +10,15 @@ from scripts.prompt import get_prompt
|
|
| 10 |
|
| 11 |
def evaluate_negative_rejection(config):
|
| 12 |
"""Evaluates negative rejection for a given model by processing predictions and computing scores."""
|
| 13 |
-
config[
|
| 14 |
-
modelname = config[
|
| 15 |
-
noise_rate = config[
|
| 16 |
-
passage_num = config[
|
| 17 |
|
| 18 |
-
if config[
|
| 19 |
-
model = GroqClient(plm=config[
|
| 20 |
else:
|
| 21 |
-
logging.warning(f"Skipping unknown model: {config[
|
| 22 |
return
|
| 23 |
|
| 24 |
# File paths
|
|
|
|
| 10 |
|
| 11 |
def evaluate_negative_rejection(config):
|
| 12 |
"""Evaluates negative rejection for a given model by processing predictions and computing scores."""
|
| 13 |
+
config['noise_rate'] = 1.0 # Noise rate should be 1.0 for negative rejection evaluation
|
| 14 |
+
modelname = config['model_name']
|
| 15 |
+
noise_rate = config['noise_rate']
|
| 16 |
+
passage_num = config['passage_num']
|
| 17 |
|
| 18 |
+
if config['model_name'] in config["models"]:
|
| 19 |
+
model = GroqClient(plm=config['model_name'])
|
| 20 |
else:
|
| 21 |
+
logging.warning(f"Skipping unknown model: {config['model_name']}")
|
| 22 |
return
|
| 23 |
|
| 24 |
# File paths
|
scripts/evaluate_noise_robustness.py
CHANGED
|
@@ -11,11 +11,11 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def evaluate_noise_robustness(config):
|
| 13 |
result_path = config["result_path"] + 'Noise Robustness/'
|
| 14 |
-
noise_rate = config[
|
| 15 |
-
passage_num = config[
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
-
filename = os.path.join(result_path, f'prediction_{config[
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
@@ -45,7 +45,7 @@ def evaluate_noise_robustness(config):
|
|
| 45 |
|
| 46 |
# Save the final score file with tt and all_rate
|
| 47 |
scores = {
|
| 48 |
-
'model': config[
|
| 49 |
'accuracy': accuracy,
|
| 50 |
'noise_rate': noise_rate,
|
| 51 |
'correct_count': correct_count,
|
|
@@ -56,7 +56,7 @@ def evaluate_noise_robustness(config):
|
|
| 56 |
logging.info(f"score: {scores}")
|
| 57 |
logging.info(f"Noise Robustness Accuracy: {accuracy:.2%}")
|
| 58 |
|
| 59 |
-
score_filename = os.path.join(result_path, f'scores_{config[
|
| 60 |
with open(score_filename, 'w') as f:
|
| 61 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 62 |
|
|
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def evaluate_noise_robustness(config):
|
| 13 |
result_path = config["result_path"] + 'Noise Robustness/'
|
| 14 |
+
noise_rate = config['noise_rate']
|
| 15 |
+
passage_num = config['passage_num']
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
+
filename = os.path.join(result_path, f'prediction_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
|
|
| 45 |
|
| 46 |
# Save the final score file with tt and all_rate
|
| 47 |
scores = {
|
| 48 |
+
'model': config['model_name'],
|
| 49 |
'accuracy': accuracy,
|
| 50 |
'noise_rate': noise_rate,
|
| 51 |
'correct_count': correct_count,
|
|
|
|
| 56 |
logging.info(f"score: {scores}")
|
| 57 |
logging.info(f"Noise Robustness Accuracy: {accuracy:.2%}")
|
| 58 |
|
| 59 |
+
score_filename = os.path.join(result_path, f'scores_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 60 |
with open(score_filename, 'w') as f:
|
| 61 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 62 |
|
scripts/get_factual_evaluation.py
CHANGED
|
@@ -11,11 +11,11 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def get_factual_evaluation(config):
|
| 13 |
result_path = config["result_path"] + 'Counterfactual Robustness/'
|
| 14 |
-
noise_rate = config[
|
| 15 |
-
passage_num = config[
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
-
filename = os.path.join(result_path, f'prediction_{config[
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
@@ -61,7 +61,7 @@ def get_factual_evaluation(config):
|
|
| 61 |
scores['correct_tt'] = correct_tt
|
| 62 |
|
| 63 |
#logging.info(f"score: {scores}")
|
| 64 |
-
score_filename = os.path.join(result_path, f'scores_{config[
|
| 65 |
with open(score_filename, 'w') as f:
|
| 66 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 67 |
|
|
|
|
| 11 |
# Improved function to evaluate noise robustness
|
| 12 |
def get_factual_evaluation(config):
|
| 13 |
result_path = config["result_path"] + 'Counterfactual Robustness/'
|
| 14 |
+
noise_rate = config['noise_rate']
|
| 15 |
+
passage_num = config['passage_num']
|
| 16 |
|
| 17 |
# Iterate over each model specified in the config
|
| 18 |
+
filename = os.path.join(result_path, f'prediction_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 19 |
ensure_directory_exists(filename)
|
| 20 |
|
| 21 |
# Load existing results if file exists
|
|
|
|
| 61 |
scores['correct_tt'] = correct_tt
|
| 62 |
|
| 63 |
#logging.info(f"score: {scores}")
|
| 64 |
+
score_filename = os.path.join(result_path, f'scores_{config["model_name"]}_noise_{noise_rate}_passage_{passage_num}.json')
|
| 65 |
with open(score_filename, 'w') as f:
|
| 66 |
json.dump(scores, f, ensure_ascii=False, indent=4)
|
| 67 |
|
scripts/get_prediction_result.py
CHANGED
|
@@ -13,17 +13,17 @@ def get_prediction_result(config, data_file_name):
|
|
| 13 |
results = []
|
| 14 |
dataset = load_dataset(data_file_name)
|
| 15 |
# Create GroqClient instance for supported models
|
| 16 |
-
if config[
|
| 17 |
-
model = GroqClient(plm=config[
|
| 18 |
else:
|
| 19 |
-
logging.warning(f"Skipping unknown model: {config[
|
| 20 |
return
|
| 21 |
|
| 22 |
# Iterate through dataset and process queries
|
| 23 |
-
for idx, instance in enumerate(dataset[:config[
|
| 24 |
-
logging.info(f"Executing Query {idx + 1} for Model: {config[
|
| 25 |
|
| 26 |
-
query, ans, docs = process_data(instance, config[
|
| 27 |
|
| 28 |
# Retry mechanism for prediction
|
| 29 |
for attempt in range(1, config["retry_attempts"] + 1):
|
|
@@ -46,7 +46,7 @@ def get_prediction_result(config, data_file_name):
|
|
| 46 |
'label': label,
|
| 47 |
'prediction': prediction,
|
| 48 |
'docs': docs,
|
| 49 |
-
'noise_rate': config[
|
| 50 |
'factlabel': factlabel
|
| 51 |
}
|
| 52 |
results.append(new_instance)
|
|
|
|
| 13 |
results = []
|
| 14 |
dataset = load_dataset(data_file_name)
|
| 15 |
# Create GroqClient instance for supported models
|
| 16 |
+
if config['model_name'] in config["models"]:
|
| 17 |
+
model = GroqClient(plm=config['model_name'])
|
| 18 |
else:
|
| 19 |
+
logging.warning(f"Skipping unknown model: {config['model_name']}")
|
| 20 |
return
|
| 21 |
|
| 22 |
# Iterate through dataset and process queries
|
| 23 |
+
for idx, instance in enumerate(dataset[:config['num_queries']], start=0):
|
| 24 |
+
logging.info(f"Executing Query {idx + 1} for Model: {config['model_name']}")
|
| 25 |
|
| 26 |
+
query, ans, docs = process_data(instance, config['noise_rate'], config['passage_num'], data_file_name)
|
| 27 |
|
| 28 |
# Retry mechanism for prediction
|
| 29 |
for attempt in range(1, config["retry_attempts"] + 1):
|
|
|
|
| 46 |
'label': label,
|
| 47 |
'prediction': prediction,
|
| 48 |
'docs': docs,
|
| 49 |
+
'noise_rate': config['noise_rate'],
|
| 50 |
'factlabel': factlabel
|
| 51 |
}
|
| 52 |
results.append(new_instance)
|
scripts/helper.py
CHANGED
|
@@ -31,11 +31,11 @@ def update_config(config, model_name=None, noise_rate=None, num_queries=None):
|
|
| 31 |
dict: The updated configuration dictionary.
|
| 32 |
"""
|
| 33 |
if model_name:
|
| 34 |
-
config[
|
| 35 |
if noise_rate is not None: # Explicitly check for None to handle 0.0
|
| 36 |
-
config[
|
| 37 |
if num_queries is not None: # Explicitly check for None to handle 0
|
| 38 |
-
config[
|
| 39 |
return config
|
| 40 |
|
| 41 |
def load_dataset(file_name):
|
|
|
|
| 31 |
dict: The updated configuration dictionary.
|
| 32 |
"""
|
| 33 |
if model_name:
|
| 34 |
+
config['model_name'] = model_name
|
| 35 |
if noise_rate is not None: # Explicitly check for None to handle 0.0
|
| 36 |
+
config['noise_rate'] = float(noise_rate) # Ensure it's a float
|
| 37 |
if num_queries is not None: # Explicitly check for None to handle 0
|
| 38 |
+
config['num_queries'] = int(num_queries) # Ensure it's an integer
|
| 39 |
return config
|
| 40 |
|
| 41 |
def load_dataset(file_name):
|