import re

import bm25s  # pip install bm25s
import fitz  # pip install pymupdf
from nltk.tokenize import sent_tokenize  # needs Punkt data: nltk.download("punkt") ("punkt_tab" on newer NLTK)
from unidecode import unidecode  # pip install unidecode


def retrieve_from_pdf(pdf_file, query, k=10):
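    """Rank a PDF's sentences against `query` with BM25 and return the top k.

    The file at `pdf_file` is read, its text extracted and lightly cleaned,
    split into sentences, and indexed with bm25s. The k best-matching
    sentences are returned joined into a single space-separated string.
    """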

    # Get PDF file as binary
    with open(pdf_file, mode="rb") as f:
        pdf_file_bytes = f.read()

    # Extract text from the PDF, closing the document when done
    pdf_text = ""
    with fitz.open(stream=pdf_file_bytes, filetype="pdf") as pdf_doc:
        for page in pdf_doc:
            pdf_text += page.get_text("text")

    # Clean text, e.g.:
    #   pdf_text   = 'In §3.1, we find\nthat dis-\ntractor abstracts.'
    #   clean_text = 'In SS3.1, we find that distractor abstracts.'
    # Remove hyphens at ends of lines
    clean_text = re.sub("-\n", "", pdf_text)
    # Replace remaining newline characters with space
    clean_text = re.sub("\n", " ", clean_text)
    # Replace unicode with ascii
    clean_text = unidecode(clean_text)

    # Parse text into sentences to build the corpus
    corpus = sent_tokenize(clean_text)
    # Tokenize the corpus
    corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
    # Initialize the BM25 model
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens, show_progress=False)
    # Tokenize the query
    query_tokens = bm25s.tokenize(query)

    # Get top-k results
    # Cast k to int in case it is passed as a string (as in the retrieval example)
    results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=int(k))
    # Uncomment to print the ranked results with scores:
    # for i in range(results.shape[1]):
    #     doc, score = results[0, i], scores[0, i]
    #     print(f"Rank {i+1} (score: {score:.2f}): {doc}")

    # Join the top-k sentences into a single string and return it
    return " ".join(results[0])
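

if __name__ == "__main__":
    # Minimal usage sketch. "paper.pdf" and the query string below are
    # placeholder values for illustration, not files or examples that ship
    # with this script.
    top_sentences = retrieve_from_pdf(
        "paper.pdf",
        query="How are the distractor abstracts selected?",
        k=5,
    )
    print(top_sentences)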