dogukan-bg commited on
Commit
e77e36b
·
verified ·
1 Parent(s): 5c2f03c

Upload folder using huggingface_hub

Browse files
app.py ADDED
@@ -0,0 +1,888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Copy of HW1 (more instructed).ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/14KOszSHjoAmlL_IGF2Ixz_3rdI4I3Z-J
8
+ """
9
+
10
# Notebook shell magics (`!pip`, `!git`) are IPython-only syntax and make this
# file a SyntaxError under plain `python app.py`, so the same commands are run
# through subprocess instead.
import subprocess
import sys

_CODEBASE_URL = "https://github.com/kwang2049/nlp4web-codebase.git"

# Install the course codebase into the interpreter that runs this script.
subprocess.run(
    [sys.executable, "-m", "pip", "install", f"git+{_CODEBASE_URL}"],
    check=True,
)
# The local clone is only there for browsing the codebase, so a failure
# (e.g. the directory already exists on a re-run) is not fatal.
subprocess.run(["git", "clone", _CODEBASE_URL], check=False)
# gradio is additionally needed for the demo at the end of this file.
subprocess.run([sys.executable, "-m", "pip", "install", "gradio"], check=True)
13
+
14
+ """## Pre-requisite code
15
+
16
+ The code within this section will be used in the tasks. Please do not change these code lines.
17
+
18
+ ### SciQ loading and counting
19
+ """
20
+
21
+ from dataclasses import dataclass
22
+ import pickle
23
+ import os
24
+ from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
25
+ from nlp4web_codebase.ir.data_loaders.dm import Document
26
+ from collections import Counter
27
+ import tqdm
28
+ import re
29
+ import nltk
30
+ nltk.download("stopwords", quiet=True)
31
+ from nltk.corpus import stopwords as nltk_stopwords
32
+
33
+ LANGUAGE = "english"
34
+ word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
35
+ stopwords = set(nltk_stopwords.words(LANGUAGE))
36
+
37
+
38
def word_splitting(text: str) -> List[str]:
    """Lower-case ``text`` and split it into words with ``word_splitter``."""
    lowered = text.lower()
    return word_splitter(lowered)
40
+
41
def lemmatization(words: List[str]) -> List[str]:
    """Return ``words`` unchanged; lemmatization is skipped here for simplicity."""
    return words
43
+
44
def simple_tokenize(text: str) -> List[str]:
    """Tokenize ``text``: split into words, drop stopwords, then lemmatize."""
    kept = [word for word in word_splitting(text) if word not in stopwords]
    return lemmatization(kept)
49
+
50
+ T = TypeVar("T", bound="InvertedIndex")
51
+
52
@dataclass
class PostingList:
    """Postings of a single term, stored as two parallel lists."""

    # The term these postings belong to.
    term: str
    # docid_postings[i] is the docid (int) of the i-th posting.
    docid_postings: List[int]
    # tweight_postings[i] is the term weight (float) of the i-th posting.
    tweight_postings: List[float]
57
+
58
+
59
@dataclass
class InvertedIndex:
    """Inverted index: per-term posting lists plus id mappings, persisted with pickle."""

    posting_lists: "List[PostingList]"  # tid -> posting list (tid comes from vocab)
    vocab: Dict[str, int]  # term -> tid
    cid2docid: Dict[str, int]  # collection_id -> docid
    collection_ids: List[str]  # docid -> collection_id
    doc_texts: Optional[List[str]] = None  # docid -> document text

    def save(self, output_dir: str) -> None:
        """Pickle this index to ``<output_dir>/index.pkl``.

        The directory is created if it does not exist yet.
        """
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_saved(cls: "Type[T]", saved_dir: str) -> "T":
        """Load the index pickled at ``<saved_dir>/index.pkl``.

        Raises:
            FileNotFoundError: if ``index.pkl`` is missing from ``saved_dir``.
        """
        # The unpickled object is returned as-is, so no placeholder instance
        # needs to be constructed first.
        # NOTE: pickle can execute arbitrary code while loading; only load
        # index files that you created yourself.
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            return pickle.load(f)
80
+
81
+
82
+ # The output of the counting function:
83
@dataclass
class Counting:
    """Raw corpus statistics returned by ``run_counting``."""

    # tid -> posting list; the weights are still raw term frequencies here.
    posting_lists: List[PostingList]
    # term -> tid
    vocab: Dict[str, int]
    # collection_id -> docid
    cid2docid: Dict[str, int]
    # docid -> collection_id
    collection_ids: List[str]
    # tid -> df
    dfs: List[int]
    # docid -> doc length
    dls: List[int]
    # Mean of ``dls``.
    avgdl: float
    # Sum of all term frequencies over all counted documents.
    nterms: int
    # docid -> document text, or None when raw texts are not stored.
    doc_texts: Optional[List[str]] = None
94
+
95
def run_counting(
    documents: Iterable[Document],
    tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
    store_raw: bool = True,  # store the document text in doc_texts
    ndocs: Optional[int] = None,
    show_progress_bar: bool = True,
) -> Counting:
    """Count TFs, DFs, document lengths, etc. over ``documents``.

    Args:
        documents: Documents to count; a document whose ``collection_id`` was
            already seen is skipped.
        tokenize_fn: Maps a document text to its list of tokens.
        store_raw: Keep the raw document texts in ``Counting.doc_texts``
            (``None`` otherwise).
        ndocs: Total passed to the progress bar.
        show_progress_bar: Whether to display the progress bar.

    Returns:
        The collected ``Counting``. For an empty corpus ``avgdl`` is ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    posting_lists: List[PostingList] = []
    vocab: Dict[str, int] = {}
    cid2docid: Dict[str, int] = {}
    collection_ids: List[str] = []
    dfs: List[int] = []  # tid -> df
    dls: List[int] = []  # docid -> doc length
    nterms: int = 0
    # Decided once up front so the result is None (not []) when store_raw is
    # False, even if `documents` turns out to be empty.
    doc_texts: Optional[List[str]] = [] if store_raw else None

    for doc in tqdm.tqdm(
        documents,
        desc="Counting",
        total=ndocs,
        disable=not show_progress_bar,
    ):
        if doc.collection_id in cid2docid:
            continue  # duplicate collection id: keep only the first occurrence
        docid = len(cid2docid)
        cid2docid[doc.collection_id] = docid
        collection_ids.append(doc.collection_id)

        tok2tf = Counter(tokenize_fn(doc.text))
        doc_len = sum(tok2tf.values())
        dls.append(doc_len)
        nterms += doc_len

        for tok, tf in tok2tf.items():
            tid = vocab.get(tok)
            if tid is None:
                tid = len(vocab)
                vocab[tok] = tid
                posting_lists.append(
                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
                )
            posting_lists[tid].docid_postings.append(docid)
            posting_lists[tid].tweight_postings.append(tf)
            # NOTE(review): a term's first document is recorded as df 0, so
            # dfs[tid] ends up one less than the number of documents that
            # contain the term. Deliberately left unchanged: the reference
            # values quoted in this file were presumably produced with this
            # behaviour -- confirm before changing it.
            if tid < len(dfs):
                dfs[tid] += 1
            else:
                dfs.append(0)

        if doc_texts is not None:
            doc_texts.append(doc.text)

    return Counting(
        posting_lists=posting_lists,
        vocab=vocab,
        cid2docid=cid2docid,
        collection_ids=collection_ids,
        dfs=dfs,
        dls=dls,
        avgdl=sum(dls) / len(dls) if dls else 0.0,
        nterms=nterms,
        doc_texts=doc_texts,
    )
153
+
154
+ from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
155
+ sciq = load_sciq()
156
+ counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
157
+
158
+ """### BM25 Index"""
159
+
160
+ from __future__ import annotations
161
+ from dataclasses import asdict, dataclass
162
+ import math
163
+ import os
164
+ from typing import Iterable, List, Optional, Type
165
+ import tqdm
166
+ from nlp4web_codebase.ir.data_loaders.dm import Document
167
+
168
+
169
@dataclass
class BM25Index(InvertedIndex):
    """Inverted index whose posting weights are cached BM25 term weights."""

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Tokenize ``text`` with ``simple_tokenize``."""
        return simple_tokenize(text)

    @staticmethod
    def cache_term_weights(
        posting_lists: List[PostingList],
        total_docs: int,
        avgdl: float,
        dfs: List[int],
        dls: List[int],
        k1: float,
        b: float,
    ) -> None:
        """Replace the raw TFs in ``posting_lists`` with BM25 weights, in place.

        Args:
            posting_lists: tid -> posting list holding raw term frequencies.
            total_docs: Number of documents in the collection (N).
            avgdl: Average document length.
            dfs: tid -> document frequency.
            dls: docid -> document length.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.
        """
        for tid, posting_list in enumerate(
            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
        ):
            idf = BM25Index.calc_idf(df=dfs[tid], N=total_docs)
            weights = posting_list.tweight_postings
            # Entry i is only overwritten after it has been read, so updating
            # the list while walking it is safe.
            for i, (docid, tf) in enumerate(
                zip(posting_list.docid_postings, weights)
            ):
                regularized_tf = BM25Index.calc_regularized_tf(
                    tf=tf, dl=dls[docid], avgdl=avgdl, k1=k1, b=b
                )
                weights[i] = regularized_tf * idf

    @staticmethod
    def calc_regularized_tf(
        tf: int, dl: float, avgdl: float, k1: float, b: float
    ) -> float:
        """Return the length-normalized, saturated term frequency."""
        return tf / (tf + k1 * (1 - b + b * dl / avgdl))

    @staticmethod
    def calc_idf(df: int, N: int):
        """Return ``log(1 + (N - df + 0.5) / (df + 0.5))``."""
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    @classmethod
    def build_from_documents(
        cls: "Type[BM25Index]",
        documents: Iterable[Document],
        store_raw: bool = True,
        output_dir: Optional[str] = None,
        ndocs: Optional[int] = None,
        show_progress_bar: bool = True,
        k1: float = 0.9,
        b: float = 0.4,
    ) -> "BM25Index":
        """Count ``documents``, cache BM25 weights and assemble the index.

        Args:
            documents: Documents to index.
            store_raw: Keep the raw document texts in the index.
            output_dir: If given, the index is also saved to this directory
                (this argument used to be accepted but ignored).
            ndocs: Total passed to the counting progress bar.
            show_progress_bar: Whether to show the counting progress bar.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.

        Returns:
            The assembled ``BM25Index``.
        """
        # Counting TFs, DFs, doc_lengths, etc.:
        counting = run_counting(
            documents=documents,
            tokenize_fn=BM25Index.tokenize,
            store_raw=store_raw,
            ndocs=ndocs,
            show_progress_bar=show_progress_bar,
        )

        # Compute term weights and cache them in the posting lists:
        posting_lists = counting.posting_lists
        BM25Index.cache_term_weights(
            posting_lists=posting_lists,
            total_docs=len(counting.cid2docid),
            avgdl=counting.avgdl,
            dfs=counting.dfs,
            dls=counting.dls,
            k1=k1,
            b=b,
        )

        # Assembly and save:
        index = BM25Index(
            posting_lists=posting_lists,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        if output_dir is not None:
            index.save(output_dir)
        return index
254
+
255
bm25_index = BM25Index.build_from_documents(
    documents=iter(sciq.corpus),
    ndocs=len(sciq.corpus),  # progress-bar total; previously hard-coded as 12160
    show_progress_bar=True,
)
bm25_index.save("output/bm25_index")
# `!ls` is IPython-only syntax (a SyntaxError in a plain .py file); list the
# working directory with the standard library instead.
print("  ".join(sorted(os.listdir("."))))
262
+
263
+ """### BM25 Retriever"""
264
+
265
+ from nlp4web_codebase.ir.models import BaseRetriever
266
+ from typing import Type
267
+ from abc import abstractmethod
268
+
269
+
270
class BaseInvertedIndexRetriever(BaseRetriever):
    """Retriever that scores documents by summing cached term weights."""

    @property
    @abstractmethod
    def index_class(self) -> Type[InvertedIndex]:
        """Index class whose ``from_saved`` loads the index."""
        pass

    def __init__(self, index_dir: str) -> None:
        """Load the index stored under ``index_dir``."""
        self.index = self.index_class.from_saved(index_dir)

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return the weight of each query term in the document ``cid``.

        Query terms missing from the vocabulary or from the document are
        left out of the result.
        """
        wanted_docid = self.index.cid2docid[cid]
        vocab = self.index.vocab
        found: Dict[str, float] = {}
        for tok in self.index.tokenize(query):
            tid = vocab.get(tok)
            if tid is None:
                continue
            postings = self.index.posting_lists[tid]
            pairs = zip(postings.docid_postings, postings.tweight_postings)
            for docid, tweight in pairs:
                if docid != wanted_docid:
                    continue
                found[tok] = tweight
                break
        return found

    def score(self, query: str, cid: str) -> float:
        """Return the sum of the query-term weights in the document ``cid``."""
        weights = self.get_term_weights(query=query, cid=cid)
        return sum(weights.values())

    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Return the ``topk`` best documents as ``collection_id -> score``."""
        vocab = self.index.vocab
        scores: Dict[int, float] = {}
        for tok in self.index.tokenize(query):
            tid = vocab.get(tok)
            if tid is None:
                continue
            postings = self.index.posting_lists[tid]
            pairs = zip(postings.docid_postings, postings.tweight_postings)
            for docid, tweight in pairs:
                scores[docid] = scores.get(docid, 0) + tweight
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return {
            self.index.collection_ids[docid]: score
            for docid, score in ranked[:topk]
        }
320
+
321
+
322
class BM25Retriever(BaseInvertedIndexRetriever):
    """Inverted-index retriever backed by a saved ``BM25Index``."""

    @property
    def index_class(self) -> Type[BM25Index]:
        """The index class to load: ``BM25Index``."""
        return BM25Index
327
+
328
+ bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
329
+ bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
330
+
331
+ """# TASK1: tune b and k1 (4 points)
332
+
333
+ Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluation function (`evaluate_map`) is provided. Record the values in `plots_k1` and `plots_b`. Do it in a greedy manner: as the influence from b is larger, please first tune b (with k1 fixed to the default value 0.9) and use the best value of b to further tune k1.
334
+
335
+ $${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$
336
+ """
337
+
338
+ from nlp4web_codebase.ir.data_loaders import Split
339
+ import pytrec_eval
340
+ import numpy as np
341
+
342
def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
    """Return the mean ``map_cut_10`` of ``rankings`` on the given SciQ split.

    Args:
        rankings: query_id -> {collection_id -> score}.
        split: Dataset split whose relevance judgements are used.
    """
    metric = "map_cut_10"
    # Fetch the qrels once and reuse them (they used to be fetched twice,
    # with the first result left unused).
    qrels = sciq.get_qrels_dict(split)
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, (metric,))
    per_query = evaluator.evaluate(rankings)
    return float(np.mean([scores[metric] for scores in per_query.values()]))
348
+
349
+ """Example of using the pre-requisite code:"""
350
+
351
+ # Loading dataset:
352
+ from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
353
+ sciq = load_sciq()
354
+ counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
355
+
356
+ # Building BM25 index and save:
357
+ bm25_index = BM25Index.build_from_documents(
358
+ documents=iter(sciq.corpus),
359
+ ndocs=12160,
360
+ show_progress_bar=True
361
+ )
362
+ bm25_index.save("output/bm25_index")
363
+
364
+ # Loading index and use BM25 retriever to retrieve:
365
+ bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
366
+ print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")) # the ranking
367
+
368
+ plots_b: Dict[str, List[float]] = {
369
+ "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
370
+ "Y": []
371
+ }
372
+ plots_k1: Dict[str, List[float]] = {
373
+ "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
374
+ "Y": []
375
+ }
376
+
377
+ ## YOUR_CODE_STARTS_HERE
378
+ # Two steps should be involved:
379
+ # Step 1. Fix k1 value to the default one 0.9,
380
+ # go through all the candidate b values (0, 0.1, ..., 1.0),
381
+ # and record in plots_b["Y"] the corresponding performances obtained via evaluate_map;
382
+ # Step 2. Fix b to the best one in step 1. and do the same for k1.
383
+
384
+ # Hint (on using the pre-requisite code):
385
+ # - One can use the loaded sciq dataset directly (loaded in the pre-requisite code);
386
+ # - One can build bm25_index with `BM25Index.build_from_documents`;
387
+ # - One can use BM25Retriever to load the index and perform retrieval on the dev queries
388
+ # (dev queries can be obtained via sciq.get_split_queries(Split.dev))
389
+
390
# The dev queries do not change between runs, so fetch them once.
dev_queries = sciq.get_split_queries(Split.dev)


def _dev_map_for(b: float, k1: float) -> float:
    """Build a BM25 index with the given ``b``/``k1`` and return its dev MAP@10."""
    index = BM25Index.build_from_documents(
        documents=iter(sciq.corpus),
        ndocs=len(sciq.corpus),
        b=b,
        k1=k1,
        show_progress_bar=True,
    )
    # BM25Retriever can only load an index from disk, so round-trip through
    # the saved file.
    index.save("output/bm25_index")
    retriever = BM25Retriever(index_dir="output/bm25_index")
    rankings = {
        query.query_id: retriever.retrieve(query.text) for query in dev_queries
    }
    return evaluate_map(rankings, split=Split.dev)


k1_fixed = 0.9

# Step 1: tune b with k1 fixed to its default value.
plots_b["Y"] = [_dev_map_for(b=b, k1=k1_fixed) for b in plots_b["X"]]
best_b_position = int(np.argmax(plots_b["Y"]))  # first maximum wins on ties
best_b = plots_b["X"][best_b_position]
best_map = plots_b["Y"][best_b_position]

print(f"Best b value: {best_b} with MAP@10: {best_map}")

# Step 2: tune k1 with b fixed to the best value from step 1.
plots_k1["Y"] = [_dev_map_for(b=best_b, k1=k1) for k1 in plots_k1["X"]]
best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
best_k1_map = max(plots_k1["Y"])
452
+
453
+ ## YOU_CODE_ENDS_HERE
454
+
455
+ ## TEST_CASES (should be close to 0.8135637188208616 and 0.7512916099773244)
456
+ print(plots_k1["Y"][9])
457
+ print(plots_b["Y"][1])
458
+
459
+ ## RESULT_CHECKING_POINT
460
+ print(plots_k1)
461
+ print(plots_b)
462
+
463
+ from matplotlib import pyplot as plt
464
+ plt.plot(plots_b["X"], plots_b["Y"], label="b")
465
+ plt.plot(plots_k1["X"], plots_k1["Y"], label="k1")
466
+ plt.ylabel("MAP")
467
+ plt.legend()
468
+ plt.grid()
469
+ plt.show()
470
+
471
+ """Let's check the effectiveness gain on test after this tuning on dev"""
472
+
473
# Baseline value printed next to the tuned result below
# (presumably the test MAP@10 with default k1/b -- TODO confirm).
default_map = 0.7849

# Pick the parameter values with the highest dev MAP@10.
best_b_position = np.argmax(plots_b["Y"])
best_k1_position = np.argmax(plots_k1["Y"])
best_b = plots_b["X"][best_b_position]
best_k1 = plots_k1["X"][best_k1_position]

# Rebuild the index with the tuned parameters and reload it through the retriever.
bm25_index = BM25Index.build_from_documents(
    documents=iter(sciq.corpus),
    ndocs=12160,
    show_progress_bar=True,
    k1=best_k1,
    b=best_b,
)
bm25_index.save("output/bm25_index")
bm25_retriever = BM25Retriever(index_dir="output/bm25_index")

# Note: retrieval and evaluation are now on the test split.
test_queries = sciq.get_split_queries(Split.test)
rankings = {
    query.query_id: bm25_retriever.retrieve(query=query.text)
    for query in test_queries
}
optimized_map = evaluate_map(rankings, split=Split.test)
print(default_map, optimized_map)
491
+
492
+ """# TASK2: CSC matrix and `CSCBM25Index` (12 points)
493
+
494
+ Recall that we use Python lists to implement posting lists, mapping term IDs to the documents in which they appear. This is inefficient due to its naive design. Actually [Compressed Sparse Column matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html) is very suitable for storing the posting lists and can boost the efficiency.
495
+
496
+ ## TASK2.1: learn about `scipy.sparse.csc_matrix` (2 points)
497
+
498
+ Convert the matrix \begin{bmatrix}
499
+ 0 & 1 & 0 & 3 \\
500
+ 10 & 2 & 1 & 0 \\
501
+ 0 & 0 & 0 & 9
502
+ \end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`.
503
+ """
504
+
505
# Import from the public scipy.sparse namespace; `scipy.sparse._csc` is a
# private module and not part of scipy's documented API.
from scipy.sparse import csc_matrix

input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]]
## YOUR_CODE_STARTS_HERE
# CSC stores the non-zero values column by column:
#   data    - the non-zero values, read down each column in turn
#   indices - the row index of each value in `data`
#   indptr  - column j owns the slice data[indptr[j]:indptr[j + 1]]
data = [10, 1, 2, 1, 3, 9]
indices = [1, 0, 1, 1, 0, 2]
indptr = [0, 1, 3, 4, 6]
shape = (3, 4)
## YOUR_CODE_ENDS_HERE
output_matrix = csc_matrix((data, indices, indptr), shape=shape)

## TEST_CASES (should be 3 and 11)
index_plus_value = (output_matrix.indices + output_matrix.data).tolist()
print(index_plus_value[2])
print(index_plus_value[-1])

## RESULT_CHECKING_POINT
print(index_plus_value)
529
+
530
+ """## TASK2.2: implement `CSCBM25Index` (4 points)
531
+
532
+ Implement `CSCBM25Index` by completing the missing code. Note that `CSCInvertedIndex` is similar to `InvertedIndex` which we talked about during the class. The main difference is posting lists are represented by a CSC sparse matrix.
533
+ """
534
+
535
@dataclass
class CSCInvertedIndex:
    """Inverted index whose posting lists are the columns of a CSC sparse matrix."""

    # Term weights: one row per docid, one column per tid, so column `tid`
    # is the posting list of that term.
    posting_lists_matrix: csc_matrix
    vocab: Dict[str, int]  # term -> tid
    cid2docid: Dict[str, int]  # collection_id -> docid
    collection_ids: List[str]  # docid -> collection_id
    doc_texts: Optional[List[str]] = None  # docid -> document text

    def save(self, output_dir: str) -> None:
        """Pickle this index to ``<output_dir>/index.pkl``.

        The directory is created if it does not exist yet.
        """
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_saved(cls: "Type[T]", saved_dir: str) -> "T":
        """Load the index pickled at ``<saved_dir>/index.pkl``.

        Raises:
            FileNotFoundError: if ``index.pkl`` is missing from ``saved_dir``.
        """
        # The unpickled object is returned as-is, so no placeholder instance
        # needs to be constructed first.
        # NOTE: pickle can execute arbitrary code while loading; only load
        # index files that you created yourself.
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            return pickle.load(f)
556
+
557
@dataclass
class CSCBM25Index(CSCInvertedIndex):
    """CSC-matrix inverted index whose entries are cached BM25 term weights."""

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Tokenize ``text`` with ``simple_tokenize``."""
        return simple_tokenize(text)

    @staticmethod
    def cache_term_weights(
        posting_lists: List[PostingList],
        total_docs: int,
        avgdl: float,
        dfs: List[int],
        dls: List[int],
        k1: float,
        b: float,
    ) -> csc_matrix:
        """Compute the BM25 weights and return them as a CSC matrix.

        Args:
            posting_lists: tid -> posting list holding raw term frequencies.
            total_docs: Number of documents in the collection (N).
            avgdl: Average document length.
            dfs: tid -> document frequency.
            dls: docid -> document length.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.

        Returns:
            A float32 matrix of shape ``(total_docs, len(posting_lists))``
            with the weight of term ``tid`` in document ``docid`` at
            ``[docid, tid]``.
        """
        ## YOUR_CODE_STARTS_HERE
        weights: List[float] = []
        rows: List[int] = []  # docid of each weight
        cols: List[int] = []  # tid of each weight

        for tid, posting_list in enumerate(posting_lists):
            idf = CSCBM25Index.calc_idf(df=dfs[tid], N=total_docs)
            for docid, tf in zip(
                posting_list.docid_postings, posting_list.tweight_postings
            ):
                regularized_tf = CSCBM25Index.calc_regularized_tf(
                    tf=tf, dl=dls[docid], avgdl=avgdl, k1=k1, b=b
                )
                weights.append(idf * regularized_tf)
                rows.append(docid)
                cols.append(tid)

        return csc_matrix(
            (np.array(weights, dtype=np.float32), (rows, cols)),
            shape=(total_docs, len(posting_lists)),
        )
        ## YOUR_CODE_ENDS_HERE

    @staticmethod
    def calc_regularized_tf(
        tf: int, dl: float, avgdl: float, k1: float, b: float
    ) -> float:
        """Return the length-normalized, saturated term frequency."""
        return tf / (tf + k1 * (1 - b + b * dl / avgdl))

    @staticmethod
    def calc_idf(df: int, N: int):
        """Return ``log(1 + (N - df + 0.5) / (df + 0.5))``."""
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    @classmethod
    def build_from_documents(
        cls: "Type[CSCBM25Index]",
        documents: Iterable[Document],
        store_raw: bool = True,
        output_dir: Optional[str] = None,
        ndocs: Optional[int] = None,
        show_progress_bar: bool = True,
        k1: float = 0.9,
        b: float = 0.4,
    ) -> "CSCBM25Index":
        """Count ``documents``, compute BM25 weights and assemble the index.

        Args:
            documents: Documents to index.
            store_raw: Keep the raw document texts in the index.
            output_dir: If given, the index is also saved to this directory
                (this argument used to be accepted but ignored).
            ndocs: Total passed to the counting progress bar.
            show_progress_bar: Whether to show the counting progress bar.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.

        Returns:
            The assembled ``CSCBM25Index``.
        """
        # Counting TFs, DFs, doc_lengths, etc.:
        counting = run_counting(
            documents=documents,
            tokenize_fn=CSCBM25Index.tokenize,
            store_raw=store_raw,
            ndocs=ndocs,
            show_progress_bar=show_progress_bar,
        )

        # Compute term weights as a sparse matrix:
        posting_lists_matrix = CSCBM25Index.cache_term_weights(
            posting_lists=counting.posting_lists,
            total_docs=len(counting.cid2docid),
            avgdl=counting.avgdl,
            dfs=counting.dfs,
            dls=counting.dls,
            k1=k1,
            b=b,
        )

        # Assembly and save:
        index = CSCBM25Index(
            posting_lists_matrix=posting_lists_matrix,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        if output_dir is not None:
            index.save(output_dir)
        return index
656
+
657
+ csc_bm25_index = CSCBM25Index.build_from_documents(
658
+ documents=iter(sciq.corpus),
659
+ ndocs=12160,
660
+ show_progress_bar=True,
661
+ k1=best_k1,
662
+ b=best_b
663
+ )
664
+ csc_bm25_index.save("output/csc_bm25_index")
665
+
666
+ ## TEST_CASES (should be 7 and 95)
667
+ print(len(str(os.path.getsize("output/csc_bm25_index/index.pkl"))))
668
+ print(os.path.getsize("output/csc_bm25_index/index.pkl") // int(1e5))
669
+
670
+ ## RESULT_CHECKING_POINT
671
+ print(os.path.getsize("output/csc_bm25_index/index.pkl"))
672
+
673
+ """We can compare the size of the CSC-based index with the Python-list-based index:"""
674
+
675
+ print(os.path.getsize("output/bm25_index/index.pkl"))
676
+
677
+ """## TASK2.3: implement `CSCInvertedIndexRetriever` (6 points)
678
+
679
+ Implement `CSCInvertedIndexRetriever` by completing the missing code.
680
+ """
681
+
682
class BaseCSCInvertedIndexRetriever(BaseRetriever):
    """Retriever that sums cached term weights read from a CSC posting matrix."""

    @property
    @abstractmethod
    def index_class(self) -> Type[CSCInvertedIndex]:
        """Index class whose ``from_saved`` loads the index."""
        pass

    def __init__(self, index_dir: str) -> None:
        """Load the index stored under ``index_dir``."""
        self.index = self.index_class.from_saved(index_dir)

    def _postings(self, tid: int):
        """Return the ``(docids, weights)`` array slices of column ``tid``."""
        matrix = self.index.posting_lists_matrix
        start, end = matrix.indptr[tid], matrix.indptr[tid + 1]
        return matrix.indices[start:end], matrix.data[start:end]

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return the weight of each query term in the document ``cid``.

        Query terms missing from the vocabulary or from the document are
        left out. Weights are returned as plain Python floats, as annotated.
        """
        ## YOUR_CODE_STARTS_HERE
        target_docid = self.index.cid2docid[cid]
        vocab = self.index.vocab
        term_weights: Dict[str, float] = {}

        for term in self.index.tokenize(query):
            tid = vocab.get(term)
            if tid is None:
                continue
            docids, weights = self._postings(tid)
            # Vectorized lookup of the target document inside the column,
            # instead of converting the whole column to a Python list.
            positions = np.flatnonzero(docids == target_docid)
            if positions.size:
                term_weights[term] = float(weights[positions[0]])

        return term_weights
        ## YOUR_CODE_ENDS_HERE

    def score(self, query: str, cid: str) -> float:
        """Return the sum of the query-term weights in the document ``cid``."""
        return sum(self.get_term_weights(query=query, cid=cid).values())

    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Return the ``topk`` best documents as ``collection_id -> score``.

        Scores are plain Python floats so the result can be serialized
        (e.g. as JSON) without NumPy scalar types leaking out.
        """
        ## YOUR_CODE_STARTS_HERE
        vocab = self.index.vocab
        docid2score: Dict[int, float] = {}

        for term in self.index.tokenize(query):
            tid = vocab.get(term)
            if tid is None:
                continue
            docids, weights = self._postings(tid)
            # .tolist() converts to Python ints/floats once per column, so
            # accumulation does not depend on NumPy's scalar promotion rules.
            for docid, weight in zip(docids.tolist(), weights.tolist()):
                docid2score[docid] = docid2score.get(docid, 0.0) + weight

        top = sorted(
            docid2score.items(), key=lambda pair: pair[1], reverse=True
        )[:topk]
        return {self.index.collection_ids[docid]: score for docid, score in top}
        ## YOUR_CODE_ENDS_HERE
735
+
736
class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
    """CSC-matrix retriever backed by a saved ``CSCBM25Index``."""

    @property
    def index_class(self) -> Type[CSCBM25Index]:
        """The index class to load: ``CSCBM25Index``."""
        return CSCBM25Index
740
+
741
+ ## TEST_CASES (should be close to
742
+ # {'theory': 3.1838157176971436, 'evolution': 3.488086223602295, 'natural': 2.629807710647583, 'selection': 3.552377462387085}
743
+ # {'train-11632': 16.241527557373047, 'train-10931': 13.352127075195312, 'train-2006': 12.854086875915527, 'train-7040': 12.690572738647461, 'train-1719': 11.01913833618164, 'train-9875': 10.886155128479004, 'train-1971': 10.796306610107422, 'train-9882': 10.535819053649902, 'train-2018': 10.481085777282715, 'test-586': 10.478515625}
744
+ #)
745
+ csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
746
+ query = "Who proposed the theory of evolution by natural selection?"
747
+ print(csc_bm25_retriever.get_term_weights(query=query, cid="train-2006"))
748
+ print(csc_bm25_retriever.retrieve(query))
749
+
750
+ ## RESULT_CHECKING_POINT
751
+ csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
752
+ query = "What are the differences between immunodeficiency and autoimmune diseases?"
753
+ print(csc_bm25_retriever.get_term_weights(query=query, cid="train-1691"))
754
+ print(csc_bm25_retriever.retrieve("What are the differences between immunodeficiency and autoimmune diseases?"))
755
+
756
+ """# TASK3: a search-engine demo based on Huggingface space (4 points)
757
+
758
+ ## TASK3.1: create the gradio app (2 points)
759
+
760
+ Create a gradio app to demo the BM25 search engine index on SciQ. The app should have a single input variable for the query (of type `str`) and a single output variable for the returned ranking (of type `List[Hit]` in the code below). Please use the BM25 system with default k1 and b values.
761
+
762
+ Hint: it should use a "search" function of signature:
763
+
764
+ ```python
765
+ def search(query: str) -> List[Hit]:
766
+ ...
767
+ ```
768
+ """
769
+
770
# `!pip install` is IPython-only syntax and a SyntaxError in a plain Python
# file; install gradio through pip's module entry point instead.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "gradio"], check=True)
771
+
772
+ import gradio as gr
773
+ from typing import TypedDict
774
+
775
class Hit(TypedDict):
    """One search result returned by the demo's ``search`` function."""

    cid: str  # collection id of the retrieved document
    score: float  # retrieval score of the document
    text: str  # text of the document
779
+
780
+ demo: Optional[gr.Interface] = None # Assign your gradio demo to this variable
781
+ return_type = List[Hit]
782
+
783
+ ## YOUR_CODE_STARTS_HERE
784
+
785
# The demo must use BM25 with the default k1 and b. The default index used to
# be built here but never used: `search` read the global `bm25_retriever`,
# which at this point holds the index tuned earlier. Save the default index to
# its own directory and load a dedicated retriever from it.
default_bm25_index_dir = "output/default_bm25_index"
bm25_index = BM25Index.build_from_documents(
    documents=iter(sciq.corpus),
    ndocs=len(sciq.corpus),
    show_progress_bar=True,
)
bm25_index.save(default_bm25_index_dir)
default_bm25_retriever = BM25Retriever(index_dir=default_bm25_index_dir)


def search(query: str) -> List[Hit]:
    """Run a BM25 search (default k1 and b) over the SciQ corpus.

    Args:
        query: Search query string.

    Returns:
        The retrieved documents as ``Hit`` dicts, best first. ``text`` is
        empty when the index holds no raw document texts.
    """
    index = default_bm25_retriever.index
    doc_texts = index.doc_texts
    hits: List[Hit] = []
    for cid, score in default_bm25_retriever.retrieve(query).items():
        text = doc_texts[index.cid2docid[cid]] if doc_texts else ""
        hits.append(Hit(cid=cid, score=float(score), text=text))
    return hits


demo = gr.Interface(
    fn=search,
    inputs="text",
    outputs="text",
)
821
+ ## YOUR_CODE_ENDS_HERE
822
+ demo.launch()
823
+
824
+ ## TEST_CASES (result should be [{'cid': 'train-10966', 'score': 12.417802868109781, 'text': 'Bacteria can be used to make cheese from milk. The bacteria turn the milk sugars into lactic acid. The acid is what causes the milk to curdle to form cheese. Bacteria are also involved in producing other foods. Yogurt is made by using bacteria to ferment milk ( Figure below ). Fermenting cabbage with bacteria produces sauerkraut.'}, {'cid': 'train-0', 'score': 10.702840907292215, 'text': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}, {'cid': 'dev-569', 'score': 9.78520518303728, 'text': 'A wide range of friendly bacteria live in the gut. Bacteria begin to populate the human digestive system right after birth. Gut bacteria include Lactobacillus , the bacteria commonly used in probiotic foods such as yogurt, and E. coli bacteria. About a third of all bacteria in the gut are members of the Bacteroides species. Bacteroides are key in helping us digest plant food.'}, {'cid': 'train-1133', 'score': 8.292180216871554, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. 
It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement.'}, {'cid': 'train-5314', 'score': 8.211635318028303, 'text': 'Bacteria are often used to make cheese from milk. But making foods is not the only beneficial role of bacteria. For example, they also play an essential role in your gut!.'}, {'cid': 'train-6684', 'score': 8.168255107424818, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement. Other sources of calcium are pictured in the Figure below .'}, {'cid': 'train-7890', 'score': 7.930578384187305, 'text': 'Animals and some bacteria and fungi carry out lactic acid fermentation. Lactic acid is a waste product of this process. Our muscles perform lactic acid fermentation during strenuous exercise, since oxygen cannot be delivered to the muscles quickly enough. The buildup of lactic acid is believed to make your muscles sore after exercise. Bacteria that produce lactic acid are used to make cheese and yogurt. The lactic acid causes the proteins in milk to thicken. Lactic acid also causes tooth decay, because bacteria use the sugars in your mouth for energy.'}, {'cid': 'train-6916', 'score': 7.833677059320589, 'text': 'Yogurt is a good source of calcium. 
Yogurt also contains active cultures of "good" bacteria. Foods that contain these beneficial bacteria are sometimes called "probiotic. ".'}, {'cid': 'train-10029', 'score': 7.725028405457634, 'text': 'Humans have collected and grown mushrooms for food for thousands of years. Figure below shows some of the many types of mushrooms that people eat. Yeasts are used in bread baking and brewing alcoholic beverages. Other fungi are used in fermenting a wide variety of foods, including soy sauce, tempeh, and cheeses. Blue cheese has its distinctive appearance and flavor because of the fungus growing though it (see Figure below ).'}, {'cid': 'train-10983', 'score': 7.334055808872751, 'text': "No doubt you've had a sore throat before, and you've probably eaten cheese or yogurt. If so, then you've already encountered the amazing world of prokaryotes. Prokaryotes are single-celled organisms that lack a nucleus. They also lack other membrane-bound organelles. Prokaryotes are tiny. They can only be viewed with a microscope (see Figure below ). But they are the most numerous organisms on Earth. Without them, the world would be a very different place."}])
825
# Smoke test against the locally launched Gradio app (`demo`): submit one query
# to the `/call/predict` endpoint, then read back the server-sent result.
import ast
import json

import requests

headers = {"Content-Type": "application/json"}
data = {"data": ["What type of organism is commonly used in preparation of foods such as cheese and yogurt?"]}
response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
response.raise_for_status()  # fail loudly instead of with a confusing KeyError below
event_id = response.json()["event_id"]
response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True)
response.raise_for_status()
lines = list(response.iter_lines())
# lines[1] is expected to be the "data: [...]" line of the event stream.
# Only the leading "data:" marker is removed, so any "data:" occurring inside
# the returned passages is left intact.
payload = lines[1].decode("UTF-8").removeprefix("data:")
# The endpoint returns the repr of a list of dicts; parse it as a Python
# literal rather than executing text received over HTTP with eval().
print(ast.literal_eval(json.loads(payload)[0]))
835
+
836
+ ## RESULT_CHECKING_POINT
837
# Result check against the locally launched Gradio app (`demo`): submit one
# query to the `/call/predict` endpoint, then read back the server-sent result.
import ast
import json

import requests

headers = {"Content-Type": "application/json"}
data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]}
response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
response.raise_for_status()  # fail loudly instead of with a confusing KeyError below
event_id = response.json()["event_id"]
response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True)
response.raise_for_status()
lines = list(response.iter_lines())
# lines[1] is expected to be the "data: [...]" line of the event stream.
# Only the leading "data:" marker is removed, so any "data:" occurring inside
# the returned passages is left intact.
payload = lines[1].decode("UTF-8").removeprefix("data:")
# The endpoint returns the repr of a list of dicts; parse it as a Python
# literal rather than executing text received over HTTP with eval().
print(ast.literal_eval(json.loads(payload)[0]))
847
+
848
+ """## TASK3.2: upload it to Huggingface Space (2 point)
849
+
850
+ Upload your gradio app to Huggingface Space. Put your URL to the Space app in the variable `hf_space_url`.
851
+
852
+ IMPORTANT!!! You can get this URL from:
853
+
854
+ *Your Space page* -> *"three dots" on the top right* -> "Embed this Space" -> "Direct URL"
855
+
856
+ An example URL (not for our task) is: https://stabilityai-stable-diffusion-3-5-large.hf.space (from https://huggingface.co/spaces/stabilityai/stable-diffusion-3.5-large)
857
+ """
858
+
859
+ hf_space_url: Optional[str] = "https://dogukan-bg-nlp4webspace.hf.space" # Store your created Huggingface Space URL in this variable
860
+ ## YOUR_CODE_STARTS_HERE
861
+
862
+ ## YOUR_CODE_ENDS_HERE
863
+
864
+ ## RESULT_CHECKING_POINT
865
# Result check against the deployed Hugging Face Space (`hf_space_url`): submit
# one query to the `/call/predict` endpoint, then read back the server-sent result.
import ast
import json

import requests

print(hf_space_url)
headers = {"Content-Type": "application/json"}
data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]}
response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
response.raise_for_status()  # fail loudly instead of with a confusing KeyError below
event_id = response.json()["event_id"]
response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True)
response.raise_for_status()
lines = list(response.iter_lines())
# lines[1] is expected to be the "data: [...]" line of the event stream.
# Only the leading "data:" marker is removed, so any "data:" occurring inside
# the returned passages is left intact.
payload = lines[1].decode("UTF-8").removeprefix("data:")
# The response comes from a remote server: parse it as a Python literal
# rather than executing it with eval().
print(ast.literal_eval(json.loads(payload)[0]))
876
+
877
+ ## TEST_CASES (result should be [{'cid': 'train-5587', 'score': 26.74537329473182, 'text': 'The entropy change is positive as the solid state changes into the liquid state. If the transition went from the liquid to the solid state, the numerical value for would be the same, but the sign would be reversed since we are going from a less ordered to a more ordered situation.'}, {'cid': 'train-2', 'score': 25.93532475963942, 'text': 'Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (ΔHvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (ΔHsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. 
Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization.'}, {'cid': 'train-1658', 'score': 19.0263955721366, 'text': 'There are many examples in the chemical world of changes in entropy. Phase transitions are one obvious example. When a substance makes a transition from the liquid state to the gaseous state, the particles have many more possible arrangements, because they are no longer confined to a specified volume in which they are close to each other; gas particles can move freely throughout their container. Vaporization represents an increase in entropy. In the opposite direction, a liquid loses entropy when it freezes to a solid. Because solids have very ordered structures, there are fewer possible arrangements of particles that would result in the properties associated with a solid.'}, {'cid': 'train-5603', 'score': 16.14918704233498, 'text': 'Chemical energy, the energy stored in molecules and atoms, is one type of potential energy. Certain reactions can cause this energy to be released as heat. Other reactions require an input of energy, in which case the products will store more potential energy than the reactants. When we studied phase changes, we saw a relationship between energy and the state of matter. To melt a solid or boil a liquid, energy needs to be added in order to break up the intermolecular forces holding particles together in more ordered states. The reverse processes, condensation and freezing, release energy, because more favorable intermolecular interactions are formed.'}, {'cid': 'train-8144', 'score': 13.369317026860408, 'text': 'Solid carbon dioxide is also called dry ice. That’s because when it gets warmer and changes state, it doesn’t change to a liquid by melting. Instead, it changes directly to a gas without going through the liquid state. The process in which a solid changes directly to a gas is called sublimation . 
It occurs when energy is added to a solid such as dry ice. You can watch dry ice changing directly to a gas in the video at this URL: http://www. youtube. com/watch?v=J8mDGwf-5x0 .'}, {'cid': 'train-844', 'score': 12.931270408607555, 'text': 'The water droplets of fog form from water vapor in the air. Fog disappears when the water droplets change back to water vapor. These changes are examples of changes of state. A change of state occurs whenever matter changes from one state to another. Common states of matter on Earth are solid, liquid, and gas. Matter may change back and forth between any two of these states.'}, {'cid': 'train-9811', 'score': 12.904636038613848, 'text': 'Start right above point on the temperature axis and follow the red line vertically. At very low pressure, the particles of the substance are far apart from one another and the substance is in the gas state. As the pressure is increased, the particles of the substance are forced closer and closer together. Eventually the particles are pushed so close together that attractive forces cause the substance to condense into the liquid state. Continually increasing the pressure on the liquid will eventually cause the substance to solidify. For the majority of substances, the solid state is denser than the liquid state and so putting a liquid under great pressure will cause it to turn into a solid. The line segment represents the process of sublimation, where the substance changes directly from a solid to a gas. At a sufficiently low pressure, the liquid phase does not exist. The point labeled is called the triple point . The triple point is the one condition of temperature and pressure where the solid, liquid, and vapor states of a substance can all coexist at equilibrium.'}, {'cid': 'train-8260', 'score': 12.876342252900347, 'text': 'Unlike a crystalline solid, an amorphous solid is a solid that lacks an ordered internal structure. Some examples of amorphous solids include rubber, plastic, and gels. 
Glass is a very important amorphous solid that is made by cooling a mixture of materials in such a way that it does not crystallize. Glass is sometimes referred to as a supercooled liquid rather than a solid. If you have ever watched a glassblower in action, you have noticed that he takes advantage of the fact that amorphous solids do not have a distinct melting point like crystalline solids do. Instead, as glass is heated, it slowly softens and can be shaped into all sorts of interesting forms. When a glass object shatters, it does so in a very irregular way, unlike crystalline solids, which always break into fragments that have the same shape as dictated by its crystal system.'}, {'cid': 'train-317', 'score': 12.82403749702155, 'text': 'An amorphous solid is a solid that lacks an ordered internal structure.'}, {'cid': 'train-6203', 'score': 12.76684203292532, 'text': 'Matter can exist in one of several different states, including a gas, liquid, or solid state. States of matter differ in the amount of energy their molecules have. When matter recycles, it changes state by gaining or losing energy.'}]
878
# Smoke test against the deployed Hugging Face Space (`hf_space_url`): submit
# one query to the `/call/predict` endpoint, then read back the server-sent result.
import ast
import json

import requests

headers = {"Content-Type": "application/json"}
data = {"data": ["Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what?"]}
response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
response.raise_for_status()  # fail loudly instead of with a confusing KeyError below
event_id = response.json()["event_id"]
response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True)
response.raise_for_status()
lines = list(response.iter_lines())
# lines[1] is expected to be the "data: [...]" line of the event stream.
# Only the leading "data:" marker is removed, so any "data:" occurring inside
# the returned passages is left intact.
payload = lines[1].decode("UTF-8").removeprefix("data:")
# The response comes from a remote server: parse it as a Python literal
# rather than executing it with eval().
print(ast.literal_eval(json.loads(payload)[0]))
888
+
nlp4web-codebase/.gitignore ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ *.tsv
132
+ *.jsonl
133
+ *.zip
134
+ output/
nlp4web-codebase/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # nlp4web
2
+ Codebase of teaching materials for NLP4Web.
nlp4web-codebase/nlp4web_codebase/__init__.py ADDED
File without changes
nlp4web-codebase/nlp4web_codebase/ir/__init__.py ADDED
File without changes
nlp4web-codebase/nlp4web_codebase/ir/analysis.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List, Optional, Protocol
3
+ import pandas as pd
4
+ import tqdm
5
+ import ujson
6
+ from nlp4web_codebase.ir.data_loaders import IRDataset
7
+
8
+
9
def round_dict(obj: Dict[str, float], ndigits: int = 4) -> Dict[str, float]:
    """Return a copy of ``obj`` with every value rounded to ``ndigits`` decimals.

    Keys and their order are kept; the input mapping is not modified.
    """
    rounded: Dict[str, float] = {}
    for key, value in obj.items():
        rounded[key] = round(value, ndigits)
    return rounded
11
+
12
+
13
def sort_dict(obj: Dict[str, float], reverse: bool = True) -> Dict[str, float]:
    """Return a new dict holding the items of ``obj`` ordered by value.

    With ``reverse=True`` (the default) the largest value comes first. The sort
    is stable, so items with equal values keep their relative order.
    """
    ordered_items = sorted(obj.items(), key=lambda item: item[1], reverse=reverse)
    return dict(ordered_items)
15
+
16
+
17
def save_ranking_results(
    output_dir: str,
    query_ids: List[str],
    rankings: List[Dict[str, float]],
    query_performances_lists: List[Dict[str, float]],
    cid2tweights_lists: Optional[List[Dict[str, Dict[str, float]]]] = None,
):
    """Write per-query ranking results to ``<output_dir>/ranking_results.jsonl``.

    One JSON line is written per query, with the fields ``query_id``,
    ``ranking``, ``query_performances`` and ``cid2tweights``. All scores are
    rounded via ``round_dict``.

    Args:
        output_dir: Directory to write into; created if missing.
        query_ids: Query identifiers, aligned by position with the lists below.
        rankings: Per query, a mapping from collection id to score.
        query_performances_lists: Per query, a mapping from metric name to value.
        cid2tweights_lists: Optional, per query, a mapping from collection id
            to term weights. When omitted, ``cid2tweights`` is stored as ``{}``.
    """
    os.makedirs(output_dir, exist_ok=True)
    records = []
    aligned = zip(query_ids, rankings, query_performances_lists)
    for position, (qid, ranking, performances) in enumerate(aligned):
        record = {
            "query_id": qid,
            "ranking": round_dict(ranking),
            "query_performances": round_dict(performances),
            "cid2tweights": {},
        }
        if cid2tweights_lists is not None:
            term_weights = {}
            for cid, weights in cid2tweights_lists[position].items():
                term_weights[cid] = round_dict(weights)
            record["cid2tweights"] = term_weights
        records.append(record)
    destination = os.path.join(output_dir, "ranking_results.jsonl")
    pd.DataFrame(records).to_json(destination, orient="records", lines=True)
46
+
47
+
48
class TermWeightingFunction(Protocol):
    """Structural type for callables that give per-term weights for a (query, document) pair."""

    def __call__(self, query: str, cid: str) -> Dict[str, float]:
        """Return a mapping from term to weight for ``query`` against document ``cid``."""
50
+
51
+
52
def compare(
    dataset: IRDataset,
    results_path1: str,
    results_path2: str,
    output_dir: str,
    main_metric: str = "recip_rank",
    system1: Optional[str] = None,
    system2: Optional[str] = None,
    term_weighting_fn1: Optional[TermWeightingFunction] = None,
    term_weighting_fn2: Optional[TermWeightingFunction] = None,
) -> None:
    """Build a per-query comparison table of two systems and save it as TSV.

    Both result files are read as JSON lines with the fields ``query_id``,
    ``ranking`` and ``query_performances``. They are joined on ``query_id``,
    and one row per query is written to
    ``<output_dir>/compare-<system1>_vs_<system2>.tsv``. Rows are sorted by the
    difference (system 1 minus system 2) in ``main_metric``, largest first.

    Args:
        dataset: Supplies the query texts, document texts and qrels.
        results_path1: Ranking results of the first system.
        results_path2: Ranking results of the second system.
        output_dir: Directory for the TSV file; created if missing.
        main_metric: Key in ``query_performances`` used for the difference column.
        system1: Name of the first system; only used in the output file name.
        system2: Name of the second system; only used in the output file name.
        term_weighting_fn1: Optional term-weighting function of the first system.
        term_weighting_fn2: Optional term-weighting function of the second system.
            When BOTH functions are given, the rankings are recomputed over the
            union of both systems' documents as the sum of term weights, and
            the term weights are added to the table.
    """
    os.makedirs(output_dir, exist_ok=True)
    results1 = pd.read_json(results_path1, orient="records", lines=True)
    results2 = pd.read_json(results_path2, orient="records", lines=True)
    assert len(results1) == len(results2)

    # Lookup tables: qrels of all splits merged, query id -> query, doc id -> doc.
    qrels_by_query = {}
    for split in dataset.split2qrels:
        qrels_by_query.update(dataset.get_qrels_dict(split))
    qid2query = {}
    for query in dataset.queries:
        qid2query[query.query_id] = query
    cid2doc = {}
    for doc in dataset.corpus:
        cid2doc[doc.collection_id] = doc

    diff_col = f"{main_metric}:qp1-qp2"
    # After the merge, "_x" columns belong to system 1 and "_y" columns to system 2.
    merged = pd.merge(results1, results2, on="query_id", how="outer")
    records = []
    progress = tqdm.tqdm(merged.iterrows(), desc="Comparing", total=len(merged))
    for _, example in progress:
        # Texts of all documents retrieved by either system.
        docs = {}
        for ranking in (example["ranking_x"], example["ranking_y"]):
            for cid in dict(ranking):
                docs[cid] = cid2doc[cid].text
        query_id = example["query_id"]
        record = {
            "query_id": query_id,
            "query": qid2query[query_id].text,
            diff_col: example["query_performances_x"][main_metric]
            - example["query_performances_y"][main_metric],
            "ranking1": ujson.dumps(example["ranking_x"], indent=4),
            "ranking2": ujson.dumps(example["ranking_y"], indent=4),
            "docs": ujson.dumps(docs, indent=4),
            "query_performances1": ujson.dumps(
                example["query_performances_x"], indent=4
            ),
            "query_performances2": ujson.dumps(
                example["query_performances_y"], indent=4
            ),
            "qrels": ujson.dumps(qrels_by_query[query_id], indent=4),
        }
        if term_weighting_fn1 is not None and term_weighting_fn2 is not None:
            candidate_cids = set(example["ranking_x"]) | set(example["ranking_y"])
            weights1 = {}
            weights2 = {}
            scores1 = {}
            scores2 = {}
            for cid in candidate_cids:
                query_text = qid2query[query_id].text
                term_weights1 = term_weighting_fn1(query=query_text, cid=cid)
                term_weights2 = term_weighting_fn2(query=query_text, cid=cid)
                # A document's score is the sum of its term weights.
                scores1[cid] = sum(term_weights1.values())
                scores2[cid] = sum(term_weights2.values())
                weights1[cid] = term_weights1
                weights2[cid] = term_weights2
            scores1 = sort_dict(scores1)
            scores2 = sort_dict(scores2)
            record["ranking1"] = ujson.dumps(scores1, indent=4)
            record["ranking2"] = ujson.dumps(scores2, indent=4)
            # List the term weights in the same order as the recomputed rankings.
            weights1 = {cid: weights1[cid] for cid in scores1}
            weights2 = {cid: weights2[cid] for cid in scores2}
            record["cid2tweights1"] = ujson.dumps(weights1, indent=4)
            record["cid2tweights2"] = ujson.dumps(weights2, indent=4)
        records.append(record)

    table = pd.DataFrame(records).sort_values(by=diff_col, ascending=False)
    output_path = os.path.join(output_dir, f"compare-{system1}_vs_{system2}.tsv")
    table.to_csv(output_path, sep="\t", index=False)
120
+
121
+
122
+ # if __name__ == "__main__":
123
+ # # python -m lecture2.bm25.analysis
124
+ # from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
125
+ # from lecture2.bm25.bm25_retriever import BM25Retriever
126
+ # from lecture2.bm25.tfidf_retriever import TFIDFRetriever
127
+ # import numpy as np
128
+
129
+ # sciq = load_sciq()
130
+ # system1 = "bm25"
131
+ # system2 = "tfidf"
132
+ # results_path1 = f"output/sciq-{system1}/results/ranking_results.jsonl"
133
+ # results_path2 = f"output/sciq-{system2}/results/ranking_results.jsonl"
134
+ # index_dir1 = f"output/sciq-{system1}"
135
+ # index_dir2 = f"output/sciq-{system2}"
136
+ # compare(
137
+ # dataset=sciq,
138
+ # results_path1=results_path1,
139
+ # results_path2=results_path2,
140
+ # output_dir=f"output/sciq-{system1}_vs_{system2}",
141
+ # system1=system1,
142
+ # system2=system2,
143
+ # term_weighting_fn1=BM25Retriever(index_dir1).get_term_weights,
144
+ # term_weighting_fn2=TFIDFRetriever(index_dir2).get_term_weights,
145
+ # )
146
+
147
+ # # bias on #shared_terms of TFIDF:
148
+ # df1 = pd.read_json(results_path1, orient="records", lines=True)
149
+ # df2 = pd.read_json(results_path2, orient="records", lines=True)
150
+ # merged = pd.merge(df1, df2, on="query_id", how="outer")
151
+ # nterms1 = []
152
+ # nterms2 = []
153
+ # for _, row in merged.iterrows():
154
+ # nterms1.append(len(list(dict(row["cid2tweights_x"]).values())[0]))
155
+ # nterms2.append(len(list(dict(row["cid2tweights_y"]).values())[0]))
156
+ # percentiles = (5, 25, 50, 75, 95)
157
+ # print(system1, np.percentile(nterms1, percentiles), np.mean(nterms1).round(2))
158
+ # print(system2, np.percentile(nterms2, percentiles), np.mean(nterms2).round(2))
159
+ # # bm25 [ 3. 4. 5. 7. 11.] 5.64
160
+ # # tfidf [1. 2. 3. 5. 9.] 3.58
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict, List
4
+ from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
5
+
6
+
7
class Split(str, Enum):
    """Dataset split names; members compare equal to their plain string value."""

    train = "train"
    dev = "dev"
    test = "test"

    def __str__(self) -> str:
        # Keep str()/format()/f-strings at the plain value ("train") on every
        # Python version. Without this, formatting a (str, Enum) member on
        # newer Python versions (3.11+) yields "Split.train", which would
        # change ids and labels built with f-strings from a split.
        return self.value
11
+
12
+
13
@dataclass
class IRDataset:
    """An information-retrieval dataset: documents, queries and per-split relevance judgements."""

    # All documents of the collection.
    corpus: List[Document]
    # All queries, across every split.
    queries: List[Query]
    # Relevance judgements (qrels), grouped by split.
    split2qrels: Dict[Split, List[QRel]]

    def get_stats(self) -> Dict[str, int]:
        """Return the corpus size, the number of queries and the number of qrels per split."""
        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
        for split, qrels in self.split2qrels.items():
            # Use the plain split value ("train") in the key: formatting a
            # (str, Enum) member directly gives "Split.train" on newer Python
            # versions (3.11+). Plain-string keys are used as they are.
            split_name = getattr(split, "value", split)
            stats[f"|qrels-{split_name}|"] = len(qrels)
        return stats

    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
        """Return the qrels of ``split`` as ``{query_id: {collection_id: relevance}}``."""
        qrels_dict: Dict[str, Dict[str, int]] = {}
        for qrel in self.split2qrels[split]:
            qrels_dict.setdefault(qrel.query_id, {})[qrel.collection_id] = qrel.relevance
        return qrels_dict

    def get_split_queries(self, split: Split) -> List[Query]:
        """Return the queries that have at least one qrel in ``split``, in corpus query order."""
        qids = {qrel.query_id for qrel in self.split2qrels[split]}
        return [query for query in self.queries if query.query_id in qids]
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/dm.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class Document:
    """A single document (passage) of the collection."""

    # Identifier of the document within the collection.
    collection_id: str
    # Raw text of the document.
    text: str
9
+
10
+
11
@dataclass
class Query:
    """A single query."""

    # Identifier of the query.
    query_id: str
    # Raw text of the query.
    text: str
15
+
16
+
17
@dataclass
class QRel:
    """A relevance judgement linking one query to one document."""

    # Identifier of the judged query.
    query_id: str
    # Identifier of the judged document.
    collection_id: str
    # Relevance label of the document for the query.
    relevance: int
    # Optional answer string attached to the judgement.
    answer: Optional[str] = None
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/sciq.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from nlp4web_codebase.ir.data_loaders import IRDataset, Split
3
+ from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
4
+ from datasets import load_dataset
5
+ import joblib
6
+
7
+
8
@(joblib.Memory(".cache").cache)
def load_sciq(verbose: bool = False) -> IRDataset:
    """Load the SciQ dataset ("allenai/sciq") and convert it into an ``IRDataset``.

    Each kept row yields one document (its ``support`` text), one query (its
    ``question``) and one qrel with relevance 1 linking them. All three share
    the id ``"<split>-<row index>"``, e.g. ``"train-0"``. Rows with an empty
    support are skipped, as are rows whose support or question was already
    seen in an earlier row. The result is cached on disk under ``.cache`` via
    joblib.

    Args:
        verbose: When true, print the raw number of rows of each split.

    Returns:
        The assembled dataset with qrels grouped by split.
    """
    train = load_dataset("allenai/sciq", split="train")
    validation = load_dataset("allenai/sciq", split="validation")
    test = load_dataset("allenai/sciq", split="test")
    data = {Split.train: train, Split.dev: validation, Split.test: test}

    # Each duplicated record is the same to each other:
    # NOTE(review): `+` on DataFrames is an element-wise, index-aligned
    # addition, not a concatenation, so this check presumably does not verify
    # what the comment above says (pd.concat with `== 1` was likely intended).
    # Kept unchanged because a stricter check could start failing on the real
    # data -- confirm before changing.
    df = train.to_pandas() + validation.to_pandas() + test.to_pandas()
    for question, group in df.groupby("question"):
        assert len(set(group["support"].tolist())) == len(group)
        assert len(set(group["correct_answer"].tolist())) == len(group)

    # Build:
    corpus: List[Document] = []
    queries: List[Query] = []
    split2qrels: Dict[Split, List[QRel]] = {}
    question2id: Dict[str, str] = {}
    support2id: Dict[str, str] = {}
    for split, rows in data.items():
        # Use the plain value ("train"): formatting a (str, Enum) member in an
        # f-string gives "Split.train" on newer Python versions (3.11+), which
        # would silently change every document/query id.
        split_name = split.value
        if verbose:
            print(f"|raw_{split_name}|", len(rows))
        split2qrels[split] = []
        for i, row in enumerate(rows):
            example_id = f"{split_name}-{i}"
            support: str = row["support"]
            if not support.strip():
                continue  # no supporting passage -> nothing to retrieve
            question = row["question"]
            if support in support2id:
                continue  # passage already in the corpus
            support2id[support] = example_id
            if question in question2id:
                continue  # question already used (the passage stays registered)
            question2id[question] = example_id
            corpus.append(Document(collection_id=example_id, text=support))
            queries.append(Query(query_id=example_id, text=question))
            split2qrels[split].append(
                QRel(
                    query_id=example_id,
                    collection_id=example_id,
                    relevance=1,
                    answer=row["correct_answer"],
                )
            )

    # Assembly and return:
    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
61
+
62
+
63
if __name__ == "__main__":
    # Usage: python -m nlp4web_codebase.ir.data_loaders.sciq
    # Loads the dataset once, then prints the loading time and dataset statistics.
    import time
    import ujson

    started_at = time.time()
    dataset = load_sciq(verbose=True)
    elapsed = time.time() - started_at
    print(f"Loading costs: {elapsed}s")
    print(ujson.dumps(dataset.get_stats(), indent=4))
72
+ # ________________________________________________________________________________
73
+ # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
74
+ # load_sciq(verbose=True)
75
+ # |raw_train| 11679
76
+ # |raw_dev| 1000
77
+ # |raw_test| 1000
78
+ # ________________________________________________________load_sciq - 7.3s, 0.1min
79
+ # Loading costs: 7.260092735290527s
80
+ # {
81
+ # "|corpus|": 12160,
82
+ # "|queries|": 12160,
83
+ # "|qrels-train|": 10409,
84
+ # "|qrels-dev|": 875,
85
+ # "|qrels-test|": 876
86
+ # }
nlp4web-codebase/nlp4web_codebase/ir/models/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, Type
3
+
4
+
5
class BaseRetriever(ABC):
    """Abstract interface for retrievers.

    Subclasses must provide ``index_class``, ``score`` and ``retrieve``.
    ``get_term_weights`` is optional and raises ``NotImplementedError`` unless
    overridden.
    """

    @property
    @abstractmethod
    def index_class(self) -> Type[Any]:
        """The index class associated with this retriever."""

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return per-term weights of ``query`` for document ``cid``; optional to implement."""
        raise NotImplementedError

    @abstractmethod
    def score(self, query: str, cid: str) -> float:
        """Return the score of document ``cid`` for ``query``."""

    @abstractmethod
    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Return a mapping from collection id to score for ``query`` (``topk`` defaults to 10)."""
nlp4web-codebase/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ .
nlp4web-codebase/setup.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

from setuptools import setup, find_packages


# Read the long description relative to this file, so that building does not
# depend on the current working directory.
readme = (Path(__file__).resolve().parent / "README.md").read_text(encoding="utf-8")

setup(
    name="nlp4web-codebase",
    version="0.0.0",
    author="Kexin Wang",
    # NOTE(review): this looks like a placeholder left by e-mail obfuscation,
    # not a real address -- confirm and restore the author's address.
    author_email="[email protected]",
    description="Codebase of teaching materials for NLP4Web.",
    long_description=readme,
    long_description_content_type="text/markdown",
    # Fixed: the URL previously had a doubled scheme ("https://https://...").
    url="https://github.com/kwang2049/nlp4web-codebase",
    project_urls={
        "Bug Tracker": "https://github.com/kwang2049/nlp4web-codebase/issues",
    },
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
    install_requires=[
        "nltk==3.8.1",
        "numpy==1.26.4",
        "scipy==1.13.1",
        "pandas==2.2.2",
        "tqdm==4.66.5",
        "ujson==5.10.0",
        "joblib==1.4.2",
        "datasets==3.0.1",
        "pytrec_eval==0.5",
    ],
)
output/bm25_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25d3bc81c91354ee366eedda530282e3fff9431d7069327c35481c0ff7ca9702
3
+ size 11624459
output/csc_bm25_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0432479f1bca45512e6864a13931538d1f1afed9dc9888febac088d4d2deb4
3
+ size 9522928