"""Benchmark several ColBERT checkpoints on Turkish-translated retrieval datasets.

For each (dataset, model) pair: encode the full corpus, rerank it against every
query with late interaction, score the run with pytrec_eval, and append one
aggregated row per model to ``resultsn/<dataset>.csv``.
"""

import os

# Environment must be set before torch/transformers are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["HF_HOME"] = "../../cache/hgCache"
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"

import glob
import logging
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
import pytrec_eval
import torch
import tqdm
from pylate import models, rank

# Maximum number of document tokens fed to the ColBERT encoder.
document_length = 512

# Checkpoints to benchmark. NOTE: the name here must match the TrColBERT
# special-case check below (HF org is "99eren99"; the previous "9eren99"
# spelling made that branch unreachable).
model_name_or_paths = [
    "99eren99/TrColBERT",
    "jinaai/jina-colbert-v2",
    "antoinelouis/colbert-xm",
]

datasetnames = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    # "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]

for datasetname in datasetnames:
    print("#############", datasetname, "##############")
    evalResultsDf = None  # one row per model, built up across the inner loop
    for model_name_or_path in model_name_or_paths:
        torch.cuda.empty_cache()

        # ---- Load the model; each checkpoint needs slightly different kwargs.
        if "jinaai/jina-colbert-v2" == model_name_or_path:
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                query_prefix="[QueryMarker]",
                document_prefix="[DocumentMarker]",
                attend_to_expansion_tokens=True,
                trust_remote_code=True,
                document_length=document_length,
            )
        elif "antoinelouis/colbert-xm" == model_name_or_path:
            model = models.ColBERT(model_name_or_path="antoinelouis/colbert-xm")
            # Language code from https://huggingface.co/facebook/xmod-base#languages
            language = "tr_TR"
            backbone = model[0].auto_model
            # XMOD backbones route through per-language adapters.
            if backbone.__class__.__name__.lower().startswith("xmod"):
                backbone.set_default_language(language)
        else:
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                document_length=document_length,
                attend_to_expansion_tokens="attend" in model_name_or_path,
            )
        model.eval()
        model.to("cuda")

        dfDocs = pd.read_parquet(
            f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
        ).dropna()
        dfQueries = pd.read_parquet(
            f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
        ).dropna()

        if "99eren99/TrColBERT" == model_name_or_path:
            # This tokenizer does not use token_type_ids; drop the key if present.
            try:
                model.tokenizer.model_input_names.remove("token_type_ids")
            except ValueError:
                print(model_name_or_path)
            # Turkish-aware lowercasing (dotted İ -> i, dotless I -> ı) —
            # plain str.lower() would map "I" to "i", which is wrong in Turkish.
            # NOTE(review): applied only for TrColBERT; confirm the other
            # models are meant to see the original casing.
            dfDocs.TurkishText = dfDocs.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )
            dfQueries.TurkishText = dfQueries.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )

        # ---- Build corpus/query lists and the qrels structure.
        queries = []
        documents = []
        # relevant_docs[query_id][doc_id] -> 0/1 relevance judgement.
        relevant_docs = defaultdict(lambda: defaultdict(int))

        # Corpus rows: column 0 is the original id, column 2 the text.
        newId2oldId_Docs = {}
        for i, row in enumerate(dfDocs.values):
            documents.append(row[2])
            newId2oldId_Docs[i] = str(row[0])

        # Query rows share the same layout. Every (query, doc) pair starts at
        # relevance 0 so pytrec_eval sees a complete judgement matrix.
        newId2oldId_Queries = {}
        for i, row in enumerate(dfQueries.values):
            queries.append(row[2])
            newId2oldId_Queries[i] = str(row[0])
            for rowDoc in dfDocs.values:
                relevant_docs[str(row[0])][str(rowDoc[0])] = 0

        # Positive qrels overwrite the zero-initialised entries.
        dfQrels = pd.read_parquet(
            f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
        )
        for row in dfQrels.values:
            relevant_docs[str(row[0])][str(row[1])] = 1

        # ---- Encode the corpus once, then rerank the full corpus per query.
        candidateIds = [[i for i in range(len(documents))]]
        run = {}
        documents_embeddings = model.encode(
            [documents], is_query=False, show_progress_bar=True
        )
        for i, query in enumerate(tqdm.tqdm(queries)):
            queries_embeddings = model.encode([query], is_query=True)
            reranked_documents = rank.rerank(
                documents_ids=candidateIds,
                queries_embeddings=queries_embeddings,
                documents_embeddings=documents_embeddings,
            )
            # TREC run format: run[query_id][doc_id] = score.
            run[newId2oldId_Queries[i]] = {}
            for resDict in reranked_documents[0]:
                run[newId2oldId_Queries[i]][newId2oldId_Docs[resDict["id"]]] = float(
                    resDict["score"]
                )

        # ---- Score the run with every measure pytrec_eval supports.
        evaluator = pytrec_eval.RelevanceEvaluator(
            relevant_docs, pytrec_eval.supported_measures
        )
        scores = evaluator.evaluate(run)

        def print_line(measure, scope, value):
            """Print one aligned 'measure / scope / value' report line."""
            print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

        # Print the per-query measures for the first query only, as a sample;
        # query_measures also survives the loop and supplies the full set of
        # measure names for the aggregation below (scope hack).
        for query_id, query_measures in sorted(scores.items()):
            for measure, value in sorted(query_measures.items()):
                print_line(measure, query_id, value)
            break

        # Aggregate every measure over all queries into a single results row.
        resultsColumns = ["model name"]
        resultsRow = [model_name_or_path]
        for measure in sorted(query_measures.keys()):
            resultsColumns.append(measure)
            resultsRow.append(
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [per_query[measure] for per_query in scores.values()],
                )
            )

        if evalResultsDf is None:
            evalResultsDf = pd.DataFrame(columns=resultsColumns)
        # Prepend the row, then shift the index so it stays non-negative.
        evalResultsDf.loc[-1] = resultsRow
        evalResultsDf.index = evalResultsDf.index + 1
        # Ensure the output directory exists before writing.
        os.makedirs("resultsn", exist_ok=True)
        evalResultsDf.to_csv(f"resultsn/{datasetname}.csv", encoding="utf-8")