Spaces:
Sleeping
Sleeping
import logging | |
import random | |
import math | |
def process_data(instance, noise_rate, passage_num, filename, correct_rate=0):
    """Build the shuffled context-document list for one instance.

    The selection strategy is chosen from ``filename``:

    * contains ``'_int'``: ``instance['positive']`` is a list of document
      groups. One document is taken from every (shuffled) group, further
      documents are taken round-robin across the groups until the positive
      quota is reached, and the remaining slots up to ``passage_num`` are
      filled from the front of ``instance['negative']``.
    * contains ``'_fact'``: documents from ``instance['positive_wrong']``
      are sampled for the positive quota, up to
      ``ceil(passage_num * correct_rate)`` documents from
      ``instance['positive']`` are sampled from the indices not already
      used, and the noise quota is taken from ``instance['negative']``.
    * otherwise: the leading positives and leading negatives are used.

    Args:
        instance: Mapping with the keys ``'query'``, ``'answer'``,
            ``'positive'``, ``'negative'`` and, for ``'_fact'`` files,
            ``'positive_wrong'``.
        noise_rate: Share of ``passage_num`` given to negative documents;
            the negative quota is ``ceil(passage_num * noise_rate)``.
        passage_num: Target number of documents.
        filename: Name whose ``'_int'`` / ``'_fact'`` marker selects the
            strategy described above.
        correct_rate: Share of ``passage_num`` given to documents from
            ``instance['positive']``; only read for ``'_fact'`` files.

    Returns:
        A ``(query, answer, docs)`` tuple where ``docs`` is the selected
        document list in random order.
    """
    query = instance['query']
    ans = instance['answer']
    # Routed through logging (instead of a bare print) like the other traces.
    logging.debug(f"Filename: {filename}")
    logging.info(f"Query: {query}")
    logging.info(f"Answer: {ans}")

    neg_num = math.ceil(passage_num * noise_rate)
    pos_num = passage_num - neg_num

    if '_int' in filename:
        positive, negative = _select_int_docs(instance, passage_num, pos_num)
    elif '_fact' in filename:
        positive, negative = _select_fact_docs(
            instance, passage_num, neg_num, correct_rate
        )
    else:
        # Slicing already caps each quota at the size of its pool, so no
        # explicit clamping is needed. A short pool is not compensated from
        # the other one: fewer than passage_num documents are returned.
        positive = instance['positive'][:pos_num]
        negative = instance['negative'][:neg_num]

    docs = positive + negative
    # Counts are tracked by provenance so that every strategy can report
    # them, not only the default one.
    logging.info(f"Using {len(positive)} positive and {len(negative)} negative documents as context")

    random.shuffle(docs)
    return query, ans, docs


def _select_int_docs(instance, passage_num, pos_num):
    """Pick positives round-robin across the groups, then pad with negatives."""
    # Shuffle copies so the caller's instance is not reordered as a side
    # effect; shuffling a copy consumes the same random numbers.
    groups = []
    for group in instance['positive']:
        shuffled = list(group)
        random.shuffle(shuffled)
        groups.append(shuffled)

    # Every non-empty group contributes its first document, even when that
    # already exceeds pos_num.
    positive = [group[0] for group in groups if group]

    if len(positive) < pos_num:
        # default=0 keeps an instance without any positive group from raising.
        deepest = max((len(group) for group in groups), default=0)
        for depth in range(1, deepest):
            for group in groups:
                if len(positive) == pos_num:
                    break
                if len(group) > depth:
                    positive.append(group[depth])
            if len(positive) == pos_num:
                break

    # Negatives fill whatever is left of the budget after the positives.
    neg_num = passage_num - len(positive)
    negative = instance['negative'][:neg_num] if neg_num > 0 else []
    return positive, negative


def _select_fact_docs(instance, passage_num, neg_num, correct_rate):
    """Sample 'positive_wrong' documents, then correct ones, then negatives."""
    correct_num = math.ceil(passage_num * correct_rate)
    # Both shares are rounded up, so together they can exceed passage_num;
    # clamp at zero because random.sample rejects a negative sample size.
    wrong_num = max(0, passage_num - neg_num - correct_num)

    indexes = list(range(len(instance['positive'])))
    selected = random.sample(indexes, min(len(indexes), wrong_num))
    positive = [instance['positive_wrong'][i] for i in selected]

    # Correct documents come only from indices not used for a wrong document.
    used = set(selected)
    remain = [i for i in indexes if i not in used]
    if correct_num > 0 and remain:
        picked = random.sample(remain, min(len(remain), correct_num))
        positive += [instance['positive'][i] for i in picked]

    negative = instance['negative'][:neg_num] if neg_num > 0 else []
    return positive, negative