import os

# Must be set before any transformers/HF-backed import reads them.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["HF_HOME"] = "../../cache/hgCache"
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"

import gzip
import logging
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
import pytrec_eval
import tqdm
from FlagEmbedding import BGEM3FlagModel
from pylate import models, rank

# BEIR-style datasets exported as parquet (corpus/queries/qrels splits).
# NOTE(review): the original list contained "fiqa2018" twice, which re-ran the
# whole encode+evaluate pass a second time for an identical CSV; deduplicated.
DATASET_NAMES = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    # "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]


def print_line(measure, scope, value):
    """Print one aligned `measure / scope / value` row."""
    print("{:25s}{:8s}{:.4f}".format(measure, scope, value))


def _load_dataset(datasetname):
    """Load corpus, queries and qrels for one dataset.

    Assumes column 0 of each parquet is the id and column 2 the text
    (matches the original positional indexing) — TODO confirm schema.

    Returns ``(documents, doc_ids, queries, query_ids, qrels)`` where ``qrels``
    maps query id -> {doc id -> relevance}.  Every corpus document is first
    marked 0 (judged non-relevant) for every query — the explicit zeros affect
    judged-pool measures such as bpref — and pairs listed in the qrels file
    are then flipped to 1, exactly as the original script did.
    """
    df_docs = pd.read_parquet(
        f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
    ).dropna()
    df_queries = pd.read_parquet(
        f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
    ).dropna()
    df_qrels = pd.read_parquet(
        f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
    )

    documents = [row[2] for row in df_docs.values]
    doc_ids = [str(row[0]) for row in df_docs.values]
    queries = [row[2] for row in df_queries.values]
    query_ids = [str(row[0]) for row in df_queries.values]

    # dict.fromkeys replaces the original per-query Python loop over every
    # corpus row (a needless O(Q*D) dataframe re-scan) with a C-level fill.
    qrels = defaultdict(dict)
    for qid in query_ids:
        qrels[qid] = dict.fromkeys(doc_ids, 0)
    for row in df_qrels.values:
        qrels[str(row[0])][str(row[1])] = 1

    return documents, doc_ids, queries, query_ids, qrels


def _score_dataset(model, documents, doc_ids, queries, query_ids):
    """Encode the corpus once, then ColBERT-score every query against it.

    Returns a pytrec_eval "run": {query id -> {doc id -> float score}}.
    """
    document_embeddings = model.encode(
        documents,
        batch_size=4,
        max_length=512,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )
    run = {}
    for qid, query in zip(query_ids, tqdm.tqdm(queries)):
        query_embeddings = model.encode(
            [query],
            max_length=32,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=True,
        )
        query_vecs = query_embeddings["colbert_vecs"][0]
        run[qid] = {
            doc_ids[j]: float(
                model.colbert_score(query_vecs, document_embeddings["colbert_vecs"][j])
            )
            for j in range(len(documents))
        }
    return run


def _write_results(datasetname, scores, model_label="bgem3"):
    """Aggregate per-query measures and write a one-row CSV.

    The original built the same one-row frame via a ``.loc[-1]`` / index-shift
    dance on a freshly reset DataFrame; this is the direct equivalent.
    """
    # Any one query's dict carries the full set of measure names.
    first_measures = next(iter(scores.values()))
    columns = ["model name"]
    row = [model_label]
    for measure in sorted(first_measures):
        columns.append(measure)
        row.append(
            pytrec_eval.compute_aggregated_measure(
                measure, [qm[measure] for qm in scores.values()]
            )
        )
    results_df = pd.DataFrame([row], columns=columns)
    results_df.to_csv(f"results/{datasetname}_{model_label}.csv", encoding="utf-8")


def main():
    """Evaluate BGE-M3 ColBERT retrieval on every configured dataset."""
    model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
    for datasetname in DATASET_NAMES:
        documents, doc_ids, queries, query_ids, qrels = _load_dataset(datasetname)
        run = _score_dataset(model, documents, doc_ids, queries, query_ids)
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, pytrec_eval.supported_measures
        )
        scores = evaluator.evaluate(run)

        # Sanity check: print the per-measure values for the first query
        # (replaces the original's `for ...: break` scope hack).
        first_qid, first_measures = sorted(scores.items())[0]
        for measure, value in sorted(first_measures.items()):
            print_line(measure, first_qid, value)

        _write_results(datasetname, scores)


if __name__ == "__main__":
    main()