Spaces:

dogukan-bg
/

NLP4WebSpace

Sleeping

App Files Files Community

dogukan-bg committed on Nov 21, 2024

Commit

e77e36b

verified ·

1 Parent(s): 5c2f03c

Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

app.py +888 -0
nlp4web-codebase/.gitignore +134 -0
nlp4web-codebase/README.md +2 -0
nlp4web-codebase/nlp4web_codebase/__init__.py +0 -0
nlp4web-codebase/nlp4web_codebase/ir/__init__.py +0 -0
nlp4web-codebase/nlp4web_codebase/ir/analysis.py +160 -0
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/__init__.py +35 -0
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/dm.py +22 -0
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/sciq.py +86 -0
nlp4web-codebase/nlp4web_codebase/ir/models/__init__.py +21 -0
nlp4web-codebase/requirements.txt +1 -0
nlp4web-codebase/setup.py +37 -0
output/bm25_index/index.pkl +3 -0
output/csc_bm25_index/index.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,888 @@

+# -*- coding: utf-8 -*-
+"""Copy of HW1 (more instructed).ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/14KOszSHjoAmlL_IGF2Ixz_3rdI4I3Z-J
+"""
+!pip install git+https://github.com/kwang2049/nlp4web-codebase.git
+!git clone https://github.com/kwang2049/nlp4web-codebase.git  # You can always check the content of this simple codebase at any time
+!pip install gradio  # we also need this additionally for this homework
+"""## Pre-requisite code
+The code within this section will be used in the tasks. Please do not change these code lines.
+### SciQ loading and counting
+"""
+from dataclasses import dataclass
+import pickle
+import os
+from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
+from nlp4web_codebase.ir.data_loaders.dm import Document
+from collections import Counter
+import tqdm
+import re
+import nltk
+nltk.download("stopwords", quiet=True)
+from nltk.corpus import stopwords as nltk_stopwords
+LANGUAGE = "english"  # language name passed to the NLTK stopword list below
+# Finds runs of two or more word characters, so single-character tokens are dropped.
+word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
+# Stopwords are kept in a set; simple_tokenize tests membership against it.
+stopwords = set(nltk_stopwords.words(LANGUAGE))
+def word_splitting(text: str) -> List[str]:
+    return word_splitter(text.lower())
+def lemmatization(words: List[str]) -> List[str]:
+    """Placeholder lemmatizer: returns `words` unchanged (the same list object)."""
+    return words  # We ignore lemmatization here for simplicity
+def simple_tokenize(text: str) -> List[str]:
+    words = word_splitting(text)
+    tokenized = list(filter(lambda w: w not in stopwords, words))
+    tokenized = lemmatization(tokenized)
+    return tokenized
+# Type variable used by the `from_saved` alternate constructors; bound (by name) to InvertedIndex.
+T = TypeVar("T", bound="InvertedIndex")
+@dataclass
+class PostingList:
+    """Postings of a single term, stored as two parallel lists indexed by posting position."""
+    term: str  # The term
+    docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
+    tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
+@dataclass
+class InvertedIndex:
+    posting_lists: List[PostingList]  # docid -> posting_list
+    vocab: Dict[str, int]
+    cid2docid: Dict[str, int]  # collection_id -> docid
+    collection_ids: List[str]  # docid -> collection_id
+    doc_texts: Optional[List[str]] = None  # docid -> document text
+    def save(self, output_dir: str) -> None:
+        os.makedirs(output_dir, exist_ok=True)
+        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+            pickle.dump(self, f)
+    @classmethod
+    def from_saved(cls: Type[T], saved_dir: str) -> T:
+        index = cls(
+            posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
+        )
+        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+            index = pickle.load(f)
+        return index
+# The output of the counting function:
+@dataclass
+class Counting:
+    """Container for the raw corpus statistics returned by `run_counting`."""
+    posting_lists: List[PostingList]
+    vocab: Dict[str, int]
+    cid2docid: Dict[str, int]
+    collection_ids: List[str]
+    dfs: List[int]  # tid -> df
+    dls: List[int]  # docid -> doc length
+    avgdl: float
+    nterms: int
+    doc_texts: Optional[List[str]] = None
+def run_counting(
+    documents: Iterable[Document],
+    tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
+    store_raw: bool = True,  # store the document text in doc_texts
+    ndocs: Optional[int] = None,
+    show_progress_bar: bool = True,
+) -> Counting:
+    """Counting TFs, DFs, doc_lengths, etc."""
+    posting_lists: List[PostingList] = []
+    vocab: Dict[str, int] = {}
+    cid2docid: Dict[str, int] = {}
+    collection_ids: List[str] = []
+    dfs: List[int] = []  # tid -> df
+    dls: List[int] = []  # docid -> doc length
+    nterms: int = 0
+    doc_texts: Optional[List[str]] = []
+    for doc in tqdm.tqdm(
+        documents,
+        desc="Counting",
+        total=ndocs,
+        disable=not show_progress_bar,
+    ):
+        if doc.collection_id in cid2docid:
+            continue
+        collection_ids.append(doc.collection_id)
+        docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
+        toks = tokenize_fn(doc.text)
+        tok2tf = Counter(toks)
+        dls.append(sum(tok2tf.values()))
+        for tok, tf in tok2tf.items():
+            nterms += tf
+            tid = vocab.get(tok, None)
+            if tid is None:
+                posting_lists.append(
+                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
+                )
+                tid = vocab.setdefault(tok, len(vocab))
+            posting_lists[tid].docid_postings.append(docid)
+            posting_lists[tid].tweight_postings.append(tf)
+            if tid < len(dfs):
+                dfs[tid] += 1
+            else:
+                dfs.append(0)
+        if store_raw:
+            doc_texts.append(doc.text)
+        else:
+            doc_texts = None
+    return Counting(
+        posting_lists=posting_lists,
+        vocab=vocab,
+        cid2docid=cid2docid,
+        collection_ids=collection_ids,
+        dfs=dfs,
+        dls=dls,
+        avgdl=sum(dls) / len(dls),
+        nterms=nterms,
+        doc_texts=doc_texts,
+    )
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+sciq = load_sciq()  # dataset object; its `corpus` is iterated below
+# Raw counts over the whole corpus; ndocs is only the progress-bar total.
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+"""### BM25 Index"""
+from __future__ import annotations
+from dataclasses import asdict, dataclass
+import math
+import os
+from typing import Iterable, List, Optional, Type
+import tqdm
+from nlp4web_codebase.ir.data_loaders.dm import Document
+@dataclass
+class BM25Index(InvertedIndex):
+    @staticmethod
+    def tokenize(text: str) -> List[str]:
+        return simple_tokenize(text)
+    @staticmethod
+    def cache_term_weights(
+        posting_lists: List[PostingList],
+        total_docs: int,
+        avgdl: float,
+        dfs: List[int],
+        dls: List[int],
+        k1: float,
+        b: float,
+    ) -> None:
+        """Compute term weights and caching"""
+        N = total_docs
+        for tid, posting_list in enumerate(
+            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+        ):
+            idf = BM25Index.calc_idf(df=dfs[tid], N=N)
+            for i in range(len(posting_list.docid_postings)):
+                docid = posting_list.docid_postings[i]
+                tf = posting_list.tweight_postings[i]
+                dl = dls[docid]
+                regularized_tf = BM25Index.calc_regularized_tf(
+                    tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                )
+                posting_list.tweight_postings[i] = regularized_tf * idf
+    @staticmethod
+    def calc_regularized_tf(
+        tf: int, dl: float, avgdl: float, k1: float, b: float
+    ) -> float:
+        return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+    @staticmethod
+    def calc_idf(df: int, N: int):
+        return math.log(1 + (N - df + 0.5) / (df + 0.5))
+    @classmethod
+    def build_from_documents(
+        cls: Type[BM25Index],
+        documents: Iterable[Document],
+        store_raw: bool = True,
+        output_dir: Optional[str] = None,
+        ndocs: Optional[int] = None,
+        show_progress_bar: bool = True,
+        k1: float = 0.9,
+        b: float = 0.4,
+    ) -> BM25Index:
+        # Counting TFs, DFs, doc_lengths, etc.:
+        counting = run_counting(
+            documents=documents,
+            tokenize_fn=BM25Index.tokenize,
+            store_raw=store_raw,
+            ndocs=ndocs,
+            show_progress_bar=show_progress_bar,
+        )
+        # Compute term weights and caching:
+        posting_lists = counting.posting_lists
+        total_docs = len(counting.cid2docid)
+        BM25Index.cache_term_weights(
+            posting_lists=posting_lists,
+            total_docs=total_docs,
+            avgdl=counting.avgdl,
+            dfs=counting.dfs,
+            dls=counting.dls,
+            k1=k1,
+            b=b,
+        )
+        # Assembly and save:
+        index = BM25Index(
+            posting_lists=posting_lists,
+            vocab=counting.vocab,
+            cid2docid=counting.cid2docid,
+            collection_ids=counting.collection_ids,
+            doc_texts=counting.doc_texts,
+        )
+        return index
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+)
+bm25_index.save("output/bm25_index")
+!ls
+"""### BM25 Retriever"""
+from nlp4web_codebase.ir.models import BaseRetriever
+from typing import Type
+from abc import abstractmethod
+class BaseInvertedIndexRetriever(BaseRetriever):
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[InvertedIndex]:
+        pass
+    def __init__(self, index_dir: str) -> None:
+        self.index = self.index_class.from_saved(index_dir)
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        toks = self.index.tokenize(query)
+        target_docid = self.index.cid2docid[cid]
+        term_weights = {}
+        for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            posting_list = self.index.posting_lists[tid]
+            for docid, tweight in zip(
+                posting_list.docid_postings, posting_list.tweight_postings
+            ):
+                if docid == target_docid:
+                    term_weights[tok] = tweight
+                    break
+        return term_weights
+    def score(self, query: str, cid: str) -> float:
+        return sum(self.get_term_weights(query=query, cid=cid).values())
+    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        toks = self.index.tokenize(query)
+        docid2score: Dict[int, float] = {}
+        for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            posting_list = self.index.posting_lists[tid]
+            for docid, tweight in zip(
+                posting_list.docid_postings, posting_list.tweight_postings
+            ):
+                docid2score.setdefault(docid, 0)
+                docid2score[docid] += tweight
+        docid2score = dict(
+            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
+        )
+        return {
+            self.index.collection_ids[docid]: score
+            for docid, score in docid2score.items()
+        }
+class BM25Retriever(BaseInvertedIndexRetriever):
+    """Inverted-index retriever that loads its index through `BM25Index`."""
+    @property
+    def index_class(self) -> Type[BM25Index]:
+        return BM25Index
+# Load the index saved under output/bm25_index and run one sample query;
+# the return value of `retrieve` is not stored.
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
+"""# TASK1: tune b and k1 (4 points)
+Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluation function (`evaluate_map`) is provided. Record the values in `plots_k1` and `plots_b`. Do it in a greedy manner: as the influence from b is larger, please first tune b (with k1 fixed to the default value 0.9) and use the best value of b to further tune k1.
+$${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$
+"""
+from nlp4web_codebase.ir.data_loaders import Split
+import pytrec_eval
+import numpy as np
+def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
+  metric = "map_cut_10"
+  qrels = sciq.get_qrels_dict(split)
+  evaluator = pytrec_eval.RelevanceEvaluator(sciq.get_qrels_dict(split), (metric,))
+  qps = evaluator.evaluate(rankings)
+  return float(np.mean([qp[metric] for qp in qps.values()]))
+"""Example of using the pre-requisite code:"""
+# Loading dataset:
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+sciq = load_sciq()
+# Raw corpus counts; `counting` is not read again by the example lines below.
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+# Building BM25 index and save:
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True
+)
+bm25_index.save("output/bm25_index")
+# Loading index and use BM25 retriever to retrieve:
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?"))  # the ranking
+plots_b: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+plots_k1: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+## YOUR_CODE_STARTS_HERE
+# Two steps should be involved:
+# Step 1. Fix k1 value to the default one 0.9,
+# go through all the candidate b values (0, 0.1, ..., 1.0),
+# and record in plots_b["Y"] the corresponding performances obtained via evaluate_map;
+# Step 2. Fix b to the best one in step 1. and do the same for k1.
+# Hint (on using the pre-requisite code):
+# - One can use the loaded sciq dataset directly (loaded in the pre-requisite code);
+# - One can build bm25_index with `BM25Index.build_from_documents`;
+# - One can use BM25Retriever to load the index and perform retrieval on the dev queries
+# (dev queries can be obtained via sciq.get_split_queries(Split.dev))
+k1_fixed = 0.9
+best_b = 0.0
+best_map = 0.0
+# fine-tuning b
+for b in plots_b["X"]:
+    #Build
+    bm25_index = BM25Index.build_from_documents(
+        documents=iter(sciq.corpus),
+        ndocs=len(sciq.corpus),
+        b=b,
+        k1=k1_fixed,
+        show_progress_bar=True
+    )
+    bm25_index.save("output/bm25_index")
+    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+    #Retrieval
+    dev_queries = sciq.get_split_queries(Split.dev)
+    rankings = {}
+    print(dev_queries)
+    for query in dev_queries:
+        qid = query.query_id
+        rankings[qid] = bm25_retriever.retrieve(query.text)
+    # Evaluation
+    current_map = evaluate_map(rankings, split=Split.dev)
+    plots_b["Y"].append(current_map)
+    if current_map > best_map:
+        best_map = current_map
+        best_b = b
+print(f"Best b value: {best_b} with MAP@10: {best_map}")
+# fine-tuning k1
+for k1 in plots_k1["X"]:
+    # Build
+    bm25_index = BM25Index.build_from_documents(
+        documents=iter(sciq.corpus),
+        ndocs=len(sciq.corpus),
+        b=best_b,
+        k1=k1,
+        show_progress_bar=True
+    )
+    bm25_index.save("output/bm25_index")
+    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+    # Retrieval
+    dev_queries = sciq.get_split_queries(Split.dev)
+    rankings = {}
+    for query in dev_queries:
+        rankings[query.query_id] = bm25_retriever.retrieve(query.text)
+    # Evaluation
+    current_map = evaluate_map(rankings, split=Split.dev)
+    plots_k1["Y"].append(current_map)
+best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
+best_k1_map = max(plots_k1["Y"])
+## YOU_CODE_ENDS_HERE
+## TEST_CASES (should be close to 0.8135637188208616 and 0.7512916099773244)
+print(plots_k1["Y"][9])  # MAP for the k1 candidate at index 9 (k1 = 0.9)
+print(plots_b["Y"][1])  # MAP for the b candidate at index 1 (b = 0.1)
+## RESULT_CHECKING_POINT
+print(plots_k1)
+print(plots_b)
+from matplotlib import pyplot as plt
+# Both tuning curves share one x axis because both candidate grids are 0 ... 1.0.
+plt.plot(plots_b["X"], plots_b["Y"], label="b")
+plt.plot(plots_k1["X"], plots_k1["Y"], label="k1")
+plt.ylabel("MAP")
+plt.legend()
+plt.grid()
+plt.show()
+"""Let's check the effectiveness gain on test after this tuning on dev"""
+default_map = 0.7849  # hard-coded reference value — presumably the test MAP@10 with the default k1/b; TODO confirm
+# Pick the candidates with the highest dev MAP (first maximum on ties).
+best_b = plots_b["X"][np.argmax(plots_b["Y"])]
+best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
+)
+# This overwrites output/bm25_index with the tuned index.
+bm25_index.save("output/bm25_index")
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+rankings = {}
+for query in sciq.get_split_queries(Split.test):  # note this is now on test
+  ranking = bm25_retriever.retrieve(query=query.text)
+  rankings[query.query_id] = ranking
+optimized_map = evaluate_map(rankings, split=Split.test)  # note this is now on test
+print(default_map, optimized_map)
+"""# TASK2: CSC matrix and `CSCBM25Index` (12 points)
+Recall that we use Python lists to implement posting lists, mapping term IDs to the documents in which they appear. This is inefficient due to its naive design. Actually [Compressed Sparse Column matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html) is very suitable for storing the posting lists and can boost the efficiency.
+## TASK2.1: learn about `scipy.sparse.csc_matrix` (2 points)
+Convert the matrix \begin{bmatrix}
+0 & 1 & 0 & 3 \\
+10 & 2 & 1 & 0 \\
+0 & 0 & 0 & 9
+\end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`.
+"""
+from scipy.sparse._csc import csc_matrix
+input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]]
+data = None
+indices = None
+indptr = None
+shape = None
+## YOUR_CODE_STARTS_HERE
+# Please assign the values to data, indices, indptr and shape
+# One can just do it in a hard-coded manner
+data = [10, 1, 2, 1, 3, 9]  # non-zero values of input_matrix, listed column by column
+indices = [1, 0, 1, 1, 0, 2]  # row index of each entry in `data`
+indptr = [0, 1, 3, 4, 6]  # column j occupies data[indptr[j]:indptr[j + 1]]
+shape = (3, 4)  # (rows, columns) of input_matrix
+## YOUR_CODE_ENDS_HERE
+output_matrix = csc_matrix((data, indices, indptr), shape=shape)
+## TEST_CASES (should be 3 and 11)
+print((output_matrix.indices + output_matrix.data).tolist()[2])
+print((output_matrix.indices + output_matrix.data).tolist()[-1])
+## RESULT_CHECKING_POINT
+print((output_matrix.indices + output_matrix.data).tolist())
+"""## TASK2.2: implement `CSCBM25Index` (4 points)
+Implement `CSCBM25Index` by completing the missing code. Note that `CSCInvertedIndex` is similar to `InvertedIndex` which we talked about during the class. The main difference is posting lists are represented by a CSC sparse matrix.
+"""
+@dataclass
+class CSCInvertedIndex:
+    posting_lists_matrix: csc_matrix  # docid -> posting_list
+    vocab: Dict[str, int]
+    cid2docid: Dict[str, int]  # collection_id -> docid
+    collection_ids: List[str]  # docid -> collection_id
+    doc_texts: Optional[List[str]] = None  # docid -> document text
+    def save(self, output_dir: str) -> None:
+        os.makedirs(output_dir, exist_ok=True)
+        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+            pickle.dump(self, f)
+    @classmethod
+    def from_saved(cls: Type[T], saved_dir: str) -> T:
+        index = cls(
+            posting_lists_matrix=None, vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
+        )
+        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+            index = pickle.load(f)
+        return index
+@dataclass
+class CSCBM25Index(CSCInvertedIndex):
+    @staticmethod
+    def tokenize(text: str) -> List[str]:
+        return simple_tokenize(text)
+    @staticmethod
+    def cache_term_weights(
+        posting_lists: List[PostingList],
+        total_docs: int,
+        avgdl: float,
+        dfs: List[int],
+        dls: List[int],
+        k1: float,
+        b: float,
+    ) -> csc_matrix:
+        """Compute term weights and caching"""
+        ## YOUR_CODE_STARTS_HERE
+        data = []
+        row_idx = []
+        col_idx = []
+        for term_idx, posting_list in enumerate(posting_lists):
+            df = dfs[term_idx]
+            idf = CSCBM25Index.calc_idf(df, total_docs)
+            doc_ids = posting_list.docid_postings
+            term_freqs = posting_list.tweight_postings
+            for doc_id, tf in zip(doc_ids, term_freqs):
+                dl = dls[doc_id]
+                regularized_tf = CSCBM25Index.calc_regularized_tf(tf, dl, avgdl, k1, b)
+                weight = idf * regularized_tf
+                data.append(weight)
+                row_idx.append(doc_id)
+                col_idx.append(term_idx)
+        term_weights = csc_matrix((np.array(data, dtype=np.float32), (row_idx, col_idx)), shape=(total_docs, len(posting_lists)))
+        return term_weights
+        ## YOUR_CODE_ENDS_HERE
+    @staticmethod
+    def calc_regularized_tf(
+        tf: int, dl: float, avgdl: float, k1: float, b: float
+    ) -> float:
+        return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+    @staticmethod
+    def calc_idf(df: int, N: int):
+        return math.log(1 + (N - df + 0.5) / (df + 0.5))
+    @classmethod
+    def build_from_documents(
+        cls: Type[CSCBM25Index],
+        documents: Iterable[Document],
+        store_raw: bool = True,
+        output_dir: Optional[str] = None,
+        ndocs: Optional[int] = None,
+        show_progress_bar: bool = True,
+        k1: float = 0.9,
+        b: float = 0.4,
+    ) -> CSCBM25Index:
+        # Counting TFs, DFs, doc_lengths, etc.:
+        counting = run_counting(
+            documents=documents,
+            tokenize_fn=CSCBM25Index.tokenize,
+            store_raw=store_raw,
+            ndocs=ndocs,
+            show_progress_bar=show_progress_bar,
+        )
+        # Compute term weights and caching:
+        posting_lists = counting.posting_lists
+        total_docs = len(counting.cid2docid)
+        posting_lists_matrix = CSCBM25Index.cache_term_weights(
+            posting_lists=posting_lists,
+            total_docs=total_docs,
+            avgdl=counting.avgdl,
+            dfs=counting.dfs,
+            dls=counting.dls,
+            k1=k1,
+            b=b,
+        )
+        # Assembly and save:
+        index = CSCBM25Index(
+            posting_lists_matrix=posting_lists_matrix,
+            vocab=counting.vocab,
+            cid2docid=counting.cid2docid,
+            collection_ids=counting.collection_ids,
+            doc_texts=counting.doc_texts,
+        )
+        return index
+# Build the CSC index with the tuned k1/b and save it for CSCBM25Retriever.
+csc_bm25_index = CSCBM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
+)
+csc_bm25_index.save("output/csc_bm25_index")
+## TEST_CASES (should be 7 and 95)
+print(len(str(os.path.getsize("output/csc_bm25_index/index.pkl"))))  # number of digits of the size in bytes
+print(os.path.getsize("output/csc_bm25_index/index.pkl") // int(1e5))  # size in units of 100,000 bytes
+## RESULT_CHECKING_POINT
+print(os.path.getsize("output/csc_bm25_index/index.pkl"))
+"""We can compare the size of the CSC-based index with the Python-list-based index:"""
+print(os.path.getsize("output/bm25_index/index.pkl"))
+"""## TASK2.3: implement `CSCInvertedIndexRetriever` (6 points)
+Implement `CSCInvertedIndexRetriever` by completing the missing code.
+"""
+class BaseCSCInvertedIndexRetriever(BaseRetriever):
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[CSCInvertedIndex]:
+        pass
+    def __init__(self, index_dir: str) -> None:
+        self.index = self.index_class.from_saved(index_dir)
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        ## YOUR_CODE_STARTS_HERE
+        doc_id = self.index.cid2docid[cid]  # Map collection ID to document ID
+        vocab = self.index.vocab
+        term_weights = {}
+        # Tokenize the query
+        query_terms = self.index.tokenize(query)
+        for term in query_terms:
+            if term in vocab:
+                term_idx = vocab[term]
+                start, end = self.index.posting_lists_matrix.indptr[term_idx], self.index.posting_lists_matrix.indptr[term_idx + 1]
+                doc_ids = self.index.posting_lists_matrix.indices[start:end]
+                weights = self.index.posting_lists_matrix.data[start:end]
+                # Check if the term appears in the document
+                if doc_id in doc_ids:
+                    term_weights[term] = weights[list(doc_ids).index(doc_id)]
+        return term_weights
+        ## YOUR_CODE_ENDS_HERE
+    def score(self, query: str, cid: str) -> float:
+        return sum(self.get_term_weights(query=query, cid=cid).values())
+    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        ## YOUR_CODE_STARTS_HERE
+        query_terms = self.index.tokenize(query)
+        vocab = self.index.vocab
+        scores = {}
+        for term in query_terms:
+            if term in vocab:
+                term_idx = vocab[term]
+                start, end = self.index.posting_lists_matrix.indptr[term_idx], self.index.posting_lists_matrix.indptr[term_idx + 1]
+                doc_ids = self.index.posting_lists_matrix.indices[start:end]
+                weights = self.index.posting_lists_matrix.data[start:end]
+                for doc_id, weight in zip(doc_ids, weights):
+                    scores[doc_id] = scores.get(doc_id, 0.0) + weight
+        scored_docs = {self.index.collection_ids[doc_id]: score for doc_id, score in scores.items()}
+        return dict(sorted(scored_docs.items(), key=lambda item: item[1], reverse=True)[:topk])
+        ## YOUR_CODE_ENDS_HERE
+class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
+    """CSC retriever that loads its index through `CSCBM25Index`."""
+    @property
+    def index_class(self) -> Type[CSCBM25Index]:
+        return CSCBM25Index
+## TEST_CASES (should be close to
+# {'theory': 3.1838157176971436, 'evolution': 3.488086223602295, 'natural': 2.629807710647583, 'selection': 3.552377462387085}
+# {'train-11632': 16.241527557373047, 'train-10931': 13.352127075195312, 'train-2006': 12.854086875915527, 'train-7040': 12.690572738647461, 'train-1719': 11.01913833618164, 'train-9875': 10.886155128479004, 'train-1971': 10.796306610107422, 'train-9882': 10.535819053649902, 'train-2018': 10.481085777282715, 'test-586': 10.478515625}
+#)
+# Load the CSC index saved under output/csc_bm25_index and print per-term weights and a ranking.
+csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
+query = "Who proposed the theory of evolution by natural selection?"
+print(csc_bm25_retriever.get_term_weights(query=query, cid="train-2006"))
+print(csc_bm25_retriever.retrieve(query))
+## RESULT_CHECKING_POINT
+# Same two calls with a second query; the retriever is loaded from disk again.
+csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
+query = "What are the differences between immunodeficiency and autoimmune diseases?"
+print(csc_bm25_retriever.get_term_weights(query=query, cid="train-1691"))
+print(csc_bm25_retriever.retrieve("What are the differences between immunodeficiency and autoimmune diseases?"))
+"""# TASK3: a search-engine demo based on Huggingface space (4 points)
+## TASK3.1: create the gradio app (2 points)
+Create a gradio app to demo the BM25 search engine index on SciQ. The app should have a single input variable for the query (of type `str`) and a single output variable for the returned ranking (of type `List[Hit]` in the code below). Please use the BM25 system with default k1 and b values.
+Hint: it should use a "search" function of signature:
+```python
+def search(query: str) -> List[Hit]:
+  ...
+```
+"""
+!pip install gradio
+import gradio as gr
+from typing import TypedDict
+class Hit(TypedDict):
+  """One search result as returned by `search`."""
+  cid: str  # collection id of the document
+  score: float  # retrieval score of the document
+  text: str  # document text
+demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
+return_type = List[Hit]
+## YOUR_CODE_STARTS_HERE
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True
+)
+def search(query: str) -> List[Hit]:
+    """
+    Search function that performs BM25 search on the SciQ dataset.
+    Args:
+        query: Search query string
+    Returns:
+        List[Hit]: List of search results in Hit format
+    """
+    results = bm25_retriever.retrieve(query)
+    hits: List[Hit] = []
+    for cid, score in results.items():
+        docid = bm25_retriever.index.cid2docid[cid]
+        hit: Hit = Hit(
+            cid=cid,
+            score=float(score),
+            text=bm25_retriever.index.doc_texts[docid] if bm25_retriever.index.doc_texts else ""
+        )
+        hits.append(hit)
+    return hits
+demo = gr.Interface(
+    fn=search,
+    inputs="text",
+    outputs="text",
+)
+## YOUR_CODE_ENDS_HERE
+demo.launch()
+## TEST_CASES (result should be [{'cid': 'train-10966', 'score': 12.417802868109781, 'text': 'Bacteria can be used to make cheese from milk. The bacteria turn the milk sugars into lactic acid. The acid is what causes the milk to curdle to form cheese. Bacteria are also involved in producing other foods. Yogurt is made by using bacteria to ferment milk ( Figure below ). Fermenting cabbage with bacteria produces sauerkraut.'}, {'cid': 'train-0', 'score': 10.702840907292215, 'text': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}, {'cid': 'dev-569', 'score': 9.78520518303728, 'text': 'A wide range of friendly bacteria live in the gut. Bacteria begin to populate the human digestive system right after birth. Gut bacteria include Lactobacillus , the bacteria commonly used in probiotic foods such as yogurt, and E. coli bacteria. About a third of all bacteria in the gut are members of the Bacteroides species. Bacteroides are key in helping us digest plant food.'}, {'cid': 'train-1133', 'score': 8.292180216871554, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. 
It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement.'}, {'cid': 'train-5314', 'score': 8.211635318028303, 'text': 'Bacteria are often used to make cheese from milk. But making foods is not the only beneficial role of bacteria. For example, they also play an essential role in your gut!.'}, {'cid': 'train-6684', 'score': 8.168255107424818, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement. Other sources of calcium are pictured in the Figure below .'}, {'cid': 'train-7890', 'score': 7.930578384187305, 'text': 'Animals and some bacteria and fungi carry out lactic acid fermentation. Lactic acid is a waste product of this process. Our muscles perform lactic acid fermentation during strenuous exercise, since oxygen cannot be delivered to the muscles quickly enough. The buildup of lactic acid is believed to make your muscles sore after exercise. Bacteria that produce lactic acid are used to make cheese and yogurt. The lactic acid causes the proteins in milk to thicken. Lactic acid also causes tooth decay, because bacteria use the sugars in your mouth for energy.'}, {'cid': 'train-6916', 'score': 7.833677059320589, 'text': 'Yogurt is a good source of calcium. 
Yogurt also contains active cultures of "good" bacteria. Foods that contain these beneficial bacteria are sometimes called "probiotic. ".'}, {'cid': 'train-10029', 'score': 7.725028405457634, 'text': 'Humans have collected and grown mushrooms for food for thousands of years. Figure below shows some of the many types of mushrooms that people eat. Yeasts are used in bread baking and brewing alcoholic beverages. Other fungi are used in fermenting a wide variety of foods, including soy sauce, tempeh, and cheeses. Blue cheese has its distinctive appearance and flavor because of the fungus growing though it (see Figure below ).'}, {'cid': 'train-10983', 'score': 7.334055808872751, 'text': "No doubt you've had a sore throat before, and you've probably eaten cheese or yogurt. If so, then you've already encountered the amazing world of prokaryotes. Prokaryotes are single-celled organisms that lack a nucleus. They also lack other membrane-bound organelles. Prokaryotes are tiny. They can only be viewed with a microscope (see Figure below ). But they are the most numerous organisms on Earth. Without them, the world would be a very different place."}])
+import requests
+import json
+headers = {"Content-Type": "application/json"}
+data = {"data": ["What type of organism is commonly used in preparation of foods such as cheese and yogurt?"]}
+response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
+event_id = response.json()["event_id"]
+response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True)
+lines = list(response.iter_lines())
+print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))
+## RESULT_CHECKING_POINT
+import requests
+import json
+headers = {"Content-Type": "application/json"}
+data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]}
+response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
+event_id = response.json()["event_id"]
+response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True)
+lines = list(response.iter_lines())
+print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))
+"""## TASK3.2: upload it to Huggingface Space (2 point)
+Upload your gradio app to Huggingface Space. Put your URL to the Space app in the variable `hf_space_url`.
+IMPORTANT!!! You can get this URL from:
+*Your Space page* -> *"three dots" on the top right* -> "Embed this Space" -> "Direct URL"
+An example URL (not for our task) is: https://stabilityai-stable-diffusion-3-5-large.hf.space (from https://huggingface.co/spaces/stabilityai/stable-diffusion-3.5-large)
+"""
+hf_space_url: Optional[str] = "https://dogukan-bg-nlp4webspace.hf.space" # Store your created Huggingface Space URL in this variable
+## YOUR_CODE_STARTS_HERE
+## YOUR_CODE_ENDS_HERE
+## RESULT_CHECKING_POINT
+import requests
+import json
+print(hf_space_url)
+headers = {"Content-Type": "application/json"}
+data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]}
+response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
+event_id = response.json()["event_id"]
+response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True)
+lines = list(response.iter_lines())
+print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))
+## TEST_CASES  (result should be [{'cid': 'train-5587', 'score': 26.74537329473182, 'text': 'The entropy change is positive as the solid state changes into the liquid state. If the transition went from the liquid to the solid state, the numerical value for would be the same, but the sign would be reversed since we are going from a less ordered to a more ordered situation.'}, {'cid': 'train-2', 'score': 25.93532475963942, 'text': 'Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (ΔHvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (ΔHsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. 
Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization.'}, {'cid': 'train-1658', 'score': 19.0263955721366, 'text': 'There are many examples in the chemical world of changes in entropy. Phase transitions are one obvious example. When a substance makes a transition from the liquid state to the gaseous state, the particles have many more possible arrangements, because they are no longer confined to a specified volume in which they are close to each other; gas particles can move freely throughout their container. Vaporization represents an increase in entropy. In the opposite direction, a liquid loses entropy when it freezes to a solid. Because solids have very ordered structures, there are fewer possible arrangements of particles that would result in the properties associated with a solid.'}, {'cid': 'train-5603', 'score': 16.14918704233498, 'text': 'Chemical energy, the energy stored in molecules and atoms, is one type of potential energy. Certain reactions can cause this energy to be released as heat. Other reactions require an input of energy, in which case the products will store more potential energy than the reactants. When we studied phase changes, we saw a relationship between energy and the state of matter. To melt a solid or boil a liquid, energy needs to be added in order to break up the intermolecular forces holding particles together in more ordered states. The reverse processes, condensation and freezing, release energy, because more favorable intermolecular interactions are formed.'}, {'cid': 'train-8144', 'score': 13.369317026860408, 'text': 'Solid carbon dioxide is also called dry ice. That’s because when it gets warmer and changes state, it doesn’t change to a liquid by melting. Instead, it changes directly to a gas without going through the liquid state. The process in which a solid changes directly to a gas is called sublimation . 
It occurs when energy is added to a solid such as dry ice. You can watch dry ice changing directly to a gas in the video at this URL: http://www. youtube. com/watch?v=J8mDGwf-5x0 .'}, {'cid': 'train-844', 'score': 12.931270408607555, 'text': 'The water droplets of fog form from water vapor in the air. Fog disappears when the water droplets change back to water vapor. These changes are examples of changes of state. A change of state occurs whenever matter changes from one state to another. Common states of matter on Earth are solid, liquid, and gas. Matter may change back and forth between any two of these states.'}, {'cid': 'train-9811', 'score': 12.904636038613848, 'text': 'Start right above point on the temperature axis and follow the red line vertically. At very low pressure, the particles of the substance are far apart from one another and the substance is in the gas state. As the pressure is increased, the particles of the substance are forced closer and closer together. Eventually the particles are pushed so close together that attractive forces cause the substance to condense into the liquid state. Continually increasing the pressure on the liquid will eventually cause the substance to solidify. For the majority of substances, the solid state is denser than the liquid state and so putting a liquid under great pressure will cause it to turn into a solid. The line segment represents the process of sublimation, where the substance changes directly from a solid to a gas. At a sufficiently low pressure, the liquid phase does not exist. The point labeled is called the triple point . The triple point is the one condition of temperature and pressure where the solid, liquid, and vapor states of a substance can all coexist at equilibrium.'}, {'cid': 'train-8260', 'score': 12.876342252900347, 'text': 'Unlike a crystalline solid, an amorphous solid is a solid that lacks an ordered internal structure. Some examples of amorphous solids include rubber, plastic, and gels. 
Glass is a very important amorphous solid that is made by cooling a mixture of materials in such a way that it does not crystallize. Glass is sometimes referred to as a supercooled liquid rather than a solid. If you have ever watched a glassblower in action, you have noticed that he takes advantage of the fact that amorphous solids do not have a distinct melting point like crystalline solids do. Instead, as glass is heated, it slowly softens and can be shaped into all sorts of interesting forms. When a glass object shatters, it does so in a very irregular way, unlike crystalline solids, which always break into fragments that have the same shape as dictated by its crystal system.'}, {'cid': 'train-317', 'score': 12.82403749702155, 'text': 'An amorphous solid is a solid that lacks an ordered internal structure.'}, {'cid': 'train-6203', 'score': 12.76684203292532, 'text': 'Matter can exist in one of several different states, including a gas, liquid, or solid state. States of matter differ in the amount of energy their molecules have. When matter recycles, it changes state by gaining or losing energy.'}]
+import requests
+import json
+headers = {"Content-Type": "application/json"}
+data = {"data": ["Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what?"]}
+response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
+event_id = response.json()["event_id"]
+response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True)
+lines = list(response.iter_lines())
+print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))

nlp4web-codebase/.gitignore ADDED Viewed

	@@ -0,0 +1,134 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+*.tsv
+*.jsonl
+*.zip
+output/

nlp4web-codebase/README.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # nlp4web
2	+ Codebase of teaching materials for NLP4Web.

nlp4web-codebase/nlp4web_codebase/__init__.py ADDED Viewed

File without changes

nlp4web-codebase/nlp4web_codebase/ir/__init__.py ADDED Viewed

File without changes

nlp4web-codebase/nlp4web_codebase/ir/analysis.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import os
+from typing import Dict, List, Optional, Protocol
+import pandas as pd
+import tqdm
+import ujson
+from nlp4web_codebase.ir.data_loaders import IRDataset
+def round_dict(obj: Dict[str, float], ndigits: int = 4) -> Dict[str, float]:
+    return {k: round(v, ndigits=ndigits) for k, v in obj.items()}
+def sort_dict(obj: Dict[str, float], reverse: bool = True) -> Dict[str, float]:
+    return dict(sorted(obj.items(), key=lambda pair: pair[1], reverse=reverse))
+def save_ranking_results(
+    output_dir: str,
+    query_ids: List[str],
+    rankings: List[Dict[str, float]],
+    query_performances_lists: List[Dict[str, float]],
+    cid2tweights_lists: Optional[List[Dict[str, Dict[str, float]]]] = None,
+):
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, "ranking_results.jsonl")
+    rows = []
+    for i, (query_id, ranking, query_performances) in enumerate(
+        zip(query_ids, rankings, query_performances_lists)
+    ):
+        row = {
+            "query_id": query_id,
+            "ranking": round_dict(ranking),
+            "query_performances": round_dict(query_performances),
+            "cid2tweights": {},
+        }
+        if cid2tweights_lists is not None:
+            row["cid2tweights"] = {
+                cid: round_dict(tws) for cid, tws in cid2tweights_lists[i].items()
+            }
+        rows.append(row)
+    pd.DataFrame(rows).to_json(
+        output_path,
+        orient="records",
+        lines=True,
+    )
+class TermWeightingFunction(Protocol):
+    # Structural type for callables that take a query text and a document id
+    # (keyword arguments ``query`` and ``cid``) and return a weight per term.
+    def __call__(self, query: str, cid: str) -> Dict[str, float]: ...
+def compare(
+    dataset: IRDataset,
+    results_path1: str,
+    results_path2: str,
+    output_dir: str,
+    main_metric: str = "recip_rank",
+    system1: Optional[str] = None,
+    system2: Optional[str] = None,
+    term_weighting_fn1: Optional[TermWeightingFunction] = None,
+    term_weighting_fn2: Optional[TermWeightingFunction] = None,
+) -> None:
+    """Build a per-query comparison table of two ranking result files.
+
+    Both ``results_path1`` and ``results_path2`` are read as JSON-lines files
+    and joined on ``query_id``. For every query the table records the query
+    text, the difference of ``main_metric`` (system 1 minus system 2), both
+    rankings, the texts of all ranked documents, both performance dicts and
+    the qrels. When both term weighting functions are given, the rankings are
+    recomputed over the union of the ranked documents as the sum of the term
+    weights, and the term weights themselves are added to the row.
+
+    The table is sorted by the metric difference (descending) and written to
+    ``<output_dir>/compare-<system1>_vs_<system2>.tsv``.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    df1 = pd.read_json(results_path1, orient="records", lines=True)
+    df2 = pd.read_json(results_path2, orient="records", lines=True)
+    assert len(df1) == len(df2)
+    # Collect the qrels of all splits into one query_id -> {cid: relevance} dict.
+    all_qrels = {}
+    for split in dataset.split2qrels:
+        all_qrels.update(dataset.get_qrels_dict(split))
+    qid2query = {query.query_id: query for query in dataset.queries}
+    cid2doc = {doc.collection_id: doc for doc in dataset.corpus}
+    diff_col = f"{main_metric}:qp1-qp2"
+    # Columns shared by both frames get the pandas suffixes "_x" (df1) and "_y" (df2).
+    # NOTE(review): the assert above only compares lengths; with an outer merge a
+    # query_id present in just one file would yield missing values below -- confirm
+    # that both files always cover the same queries.
+    merged = pd.merge(df1, df2, on="query_id", how="outer")
+    rows = []
+    for _, example in tqdm.tqdm(merged.iterrows(), desc="Comparing", total=len(merged)):
+        docs = {cid: cid2doc[cid].text for cid in dict(example["ranking_x"])}
+        docs.update({cid: cid2doc[cid].text for cid in dict(example["ranking_y"])})
+        query_id = example["query_id"]
+        row = {
+            "query_id": query_id,
+            "query": qid2query[query_id].text,
+            diff_col: example["query_performances_x"][main_metric]
+            - example["query_performances_y"][main_metric],
+            "ranking1": ujson.dumps(example["ranking_x"], indent=4),
+            "ranking2": ujson.dumps(example["ranking_y"], indent=4),
+            "docs": ujson.dumps(docs, indent=4),
+            "query_performances1": ujson.dumps(
+                example["query_performances_x"], indent=4
+            ),
+            "query_performances2": ujson.dumps(
+                example["query_performances_y"], indent=4
+            ),
+            "qrels": ujson.dumps(all_qrels[query_id], indent=4),
+        }
+        if term_weighting_fn1 is not None and term_weighting_fn2 is not None:
+            # Score every document ranked by either system with both weighting
+            # functions, so the two rankings cover the same set of documents.
+            all_cids = set(example["ranking_x"]) | set(example["ranking_y"])
+            cid2tweights1 = {}
+            cid2tweights2 = {}
+            ranking1 = {}
+            ranking2 = {}
+            for cid in all_cids:
+                tweights1 = term_weighting_fn1(query=qid2query[query_id].text, cid=cid)
+                tweights2 = term_weighting_fn2(query=qid2query[query_id].text, cid=cid)
+                ranking1[cid] = sum(tweights1.values())
+                ranking2[cid] = sum(tweights2.values())
+                cid2tweights1[cid] = tweights1
+                cid2tweights2[cid] = tweights2
+            ranking1 = sort_dict(ranking1)
+            ranking2 = sort_dict(ranking2)
+            # These overwrite the rankings taken from the result files above.
+            row["ranking1"] = ujson.dumps(ranking1, indent=4)
+            row["ranking2"] = ujson.dumps(ranking2, indent=4)
+            # Re-key the term weights in ranking order.
+            cid2tweights1 = {cid: cid2tweights1[cid] for cid in ranking1}
+            cid2tweights2 = {cid: cid2tweights2[cid] for cid in ranking2}
+            row["cid2tweights1"] = ujson.dumps(cid2tweights1, indent=4)
+            row["cid2tweights2"] = ujson.dumps(cid2tweights2, indent=4)
+        rows.append(row)
+    table = pd.DataFrame(rows).sort_values(by=diff_col, ascending=False)
+    # NOTE(review): with the default system1/system2 of None the file is named
+    # "compare-None_vs_None.tsv" -- confirm whether callers always pass names.
+    output_path = os.path.join(output_dir, f"compare-{system1}_vs_{system2}.tsv")
+    table.to_csv(output_path, sep="\t", index=False)
+# if __name__ == "__main__":
+#     # python -m lecture2.bm25.analysis
+#     from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+#     from lecture2.bm25.bm25_retriever import BM25Retriever
+#     from lecture2.bm25.tfidf_retriever import TFIDFRetriever
+#     import numpy as np
+#     sciq = load_sciq()
+#     system1 = "bm25"
+#     system2 = "tfidf"
+#     results_path1 = f"output/sciq-{system1}/results/ranking_results.jsonl"
+#     results_path2 = f"output/sciq-{system2}/results/ranking_results.jsonl"
+#     index_dir1 = f"output/sciq-{system1}"
+#     index_dir2 = f"output/sciq-{system2}"
+#     compare(
+#         dataset=sciq,
+#         results_path1=results_path1,
+#         results_path2=results_path2,
+#         output_dir=f"output/sciq-{system1}_vs_{system2}",
+#         system1=system1,
+#         system2=system2,
+#         term_weighting_fn1=BM25Retriever(index_dir1).get_term_weights,
+#         term_weighting_fn2=TFIDFRetriever(index_dir2).get_term_weights,
+#     )
+#     # bias on #shared_terms of TFIDF:
+#     df1 = pd.read_json(results_path1, orient="records", lines=True)
+#     df2 = pd.read_json(results_path2, orient="records", lines=True)
+#     merged = pd.merge(df1, df2, on="query_id", how="outer")
+#     nterms1 = []
+#     nterms2 = []
+#     for _, row in merged.iterrows():
+#         nterms1.append(len(list(dict(row["cid2tweights_x"]).values())[0]))
+#         nterms2.append(len(list(dict(row["cid2tweights_y"]).values())[0]))
+#     percentiles = (5, 25, 50, 75, 95)
+#     print(system1, np.percentile(nterms1, percentiles), np.mean(nterms1).round(2))
+#     print(system2, np.percentile(nterms2, percentiles), np.mean(nterms2).round(2))
+#     # bm25 [ 3.  4.  5.  7. 11.] 5.64
+#     # tfidf [1. 2. 3. 5. 9.] 3.58

nlp4web-codebase/nlp4web_codebase/ir/data_loaders/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List
+from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
+class Split(str, Enum):
+    # Dataset partitions. The str mixin makes each member an instance of str
+    # that compares equal to its value (e.g. Split.train == "train").
+    train = "train"
+    dev = "dev"
+    test = "test"
+@dataclass
+class IRDataset:
+    corpus: List[Document]
+    queries: List[Query]
+    split2qrels: Dict[Split, List[QRel]]
+    def get_stats(self) -> Dict[str, int]:
+        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
+        for split, qrels in self.split2qrels.items():
+            stats[f"|qrels-{split}|"] = len(qrels)
+        return stats
+    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
+        qrels_dict = {}
+        for qrel in self.split2qrels[split]:
+            qrels_dict.setdefault(qrel.query_id, {})
+            qrels_dict[qrel.query_id][qrel.collection_id] = qrel.relevance
+        return qrels_dict
+    def get_split_queries(self, split: Split) -> List[Query]:
+        qrels = self.split2qrels[split]
+        qids = {qrel.query_id for qrel in qrels}
+        return list(filter(lambda query: query.query_id in qids, self.queries))

nlp4web-codebase/nlp4web_codebase/ir/data_loaders/dm.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class Document:
+    # A corpus entry: its identifier and its text.
+    collection_id: str
+    text: str
+@dataclass
+class Query:
+    # A query: its identifier and its text.
+    query_id: str
+    text: str
+@dataclass
+class QRel:
+    # A relevance judgement linking a query to a document, with an integer
+    # relevance label and an optional answer string.
+    query_id: str
+    collection_id: str
+    relevance: int
+    answer: Optional[str] = None

nlp4web-codebase/nlp4web_codebase/ir/data_loaders/sciq.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from typing import Dict, List
+from nlp4web_codebase.ir.data_loaders import IRDataset, Split
+from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
+from datasets import load_dataset
+import joblib
+@(joblib.Memory(".cache").cache)
+def load_sciq(verbose: bool = False) -> IRDataset:
+    train = load_dataset("allenai/sciq", split="train")
+    validation = load_dataset("allenai/sciq", split="validation")
+    test = load_dataset("allenai/sciq", split="test")
+    data = {Split.train: train, Split.dev: validation, Split.test: test}
+    # Each duplicated record is the same to each other:
+    df = train.to_pandas() + validation.to_pandas() + test.to_pandas()
+    for question, group in df.groupby("question"):
+        assert len(set(group["support"].tolist())) == len(group)
+        assert len(set(group["correct_answer"].tolist())) == len(group)
+    # Build:
+    corpus = []
+    queries = []
+    split2qrels: Dict[str, List[dict]] = {}
+    question2id = {}
+    support2id = {}
+    for split, rows in data.items():
+        if verbose:
+            print(f"|raw_{split}|", len(rows))
+        split2qrels[split] = []
+        for i, row in enumerate(rows):
+            example_id = f"{split}-{i}"
+            support: str = row["support"]
+            if len(support.strip()) == 0:
+                continue
+            question = row["question"]
+            if len(support.strip()) == 0:
+                continue
+            if support in support2id:
+                continue
+            else:
+                support2id[support] = example_id
+            if question in question2id:
+                continue
+            else:
+                question2id[question] = example_id
+            doc = {"collection_id": example_id, "text": support}
+            query = {"query_id": example_id, "text": row["question"]}
+            qrel = {
+                "query_id": example_id,
+                "collection_id": example_id,
+                "relevance": 1,
+                "answer": row["correct_answer"],
+            }
+            corpus.append(Document(**doc))
+            queries.append(Query(**query))
+            split2qrels[split].append(QRel(**qrel))
+    # Assembly and return:
+    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
+if __name__ == "__main__":
+    # python -m nlp4web_codebase.ir.data_loaders.sciq
+    # Times one load_sciq call and prints the dataset statistics. The comment
+    # block at the end appears to be the recorded output of one such run.
+    import ujson
+    import time
+    start = time.time()
+    dataset = load_sciq(verbose=True)
+    print(f"Loading costs: {time.time() - start}s")
+    print(ujson.dumps(dataset.get_stats(), indent=4))
+    # ________________________________________________________________________________
+    # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
+    # load_sciq(verbose=True)
+    # |raw_train| 11679
+    # |raw_dev| 1000
+    # |raw_test| 1000
+    # ________________________________________________________load_sciq - 7.3s, 0.1min
+    # Loading costs: 7.260092735290527s
+    # {
+    #     "|corpus|": 12160,
+    #     "|queries|": 12160,
+    #     "|qrels-train|": 10409,
+    #     "|qrels-dev|": 875,
+    #     "|qrels-test|": 876
+    # }

nlp4web-codebase/nlp4web_codebase/ir/models/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from abc import ABC, abstractmethod
+from typing import Any, Dict, Type
+class BaseRetriever(ABC):
+    """Abstract interface for retrievers.
+
+    Subclasses must provide ``index_class``, ``score`` and ``retrieve``;
+    ``get_term_weights`` is optional and raises ``NotImplementedError`` unless
+    overridden.
+    """
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[Any]:
+        # presumably the class of the index this retriever works on -- confirm with subclasses
+        pass
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        # Not abstract: subclasses without per-term weights can leave it out.
+        raise NotImplementedError
+    @abstractmethod
+    def score(self, query: str, cid: str) -> float:
+        # Returns a single float for the (query, document id) pair.
+        pass
+    @abstractmethod
+    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        # Returns a str -> float mapping; presumably document id -> score for
+        # at most ``topk`` documents -- confirm with subclasses.
+        pass

nlp4web-codebase/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ .

nlp4web-codebase/setup.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from setuptools import setup, find_packages
+with open("README.md", "r", encoding="utf-8") as fh:
+    readme = fh.read()
+setup(
+    name="nlp4web-codebase",
+    version="0.0.0",
+    author="Kexin Wang",
+    author_email="[email protected]",
+    description="Codebase of teaching materials for NLP4Web.",
+    long_description=readme,
+    long_description_content_type="text/markdown",
+    url="https://https://github.com/kwang2049/nlp4web-codebase",
+    project_urls={
+        "Bug Tracker": "https://github.com/kwang2049/nlp4web-codebase/issues",
+    },
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.10",
+    install_requires=[
+        "nltk==3.8.1",
+        "numpy==1.26.4",
+        "scipy==1.13.1",
+        "pandas==2.2.2",
+        "tqdm==4.66.5",
+        "ujson==5.10.0",
+        "joblib==1.4.2",
+        "datasets==3.0.1",
+        "pytrec_eval==0.5",
+    ],
+)

output/bm25_index/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25d3bc81c91354ee366eedda530282e3fff9431d7069327c35481c0ff7ca9702
+size 11624459

output/csc_bm25_index/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b0432479f1bca45512e6864a13931538d1f1afed9dc9888febac088d4d2deb4
+size 9522928