import re

import fitz  # pip install pymupdf
from unidecode import unidecode
from nltk.tokenize import sent_tokenize
import bm25s


def retrieve_from_pdf(pdf_file, query, k=10):
    # Get PDF file as binary
    with open(pdf_file, mode="rb") as f:
        pdf_file_bytes = f.read()

    # Extract text from the PDF
    pdf_doc = fitz.open(stream=pdf_file_bytes, filetype="pdf")
    pdf_text = ""
    for page_num in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_num)
        pdf_text += page.get_text("text")

    # Clean text, e.g.:
    # pdf_text   = 'In §3.1, we find\nthat dis-\ntractor abstracts.'
    # clean_text = 'In SS3.1, we find that distractor abstracts.'
    # Remove hyphens at end of lines
    clean_text = re.sub("-\n", "", pdf_text)
    # Replace remaining newline characters with space
    clean_text = re.sub("\n", " ", clean_text)
    # Replace unicode with ascii
    clean_text = unidecode(clean_text)

    # Parse text into sentences to build the corpus
    corpus = sent_tokenize(clean_text)

    # Tokenize the corpus
    corpus_tokens = bm25s.tokenize(corpus, stopwords="en")

    # Initialize the BM25 model
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens, show_progress=False)

    # Tokenize the query
    query_tokens = bm25s.tokenize(query)

    # Get top-k results
    # Use int(k) in case we get a str value (as in the retrieval example)
    results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=int(k))

    ## Print results
    # for i in range(results.shape[1]):
    #     doc, score = results[0, i], scores[0, i]
    #     print(f"Rank {i+1} (score: {score:.2f}): {doc}")

    # Join sentences and return results
    results = " ".join(results[0])
    return results
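

# A minimal usage sketch (not part of the function above). The PDF path and
# query strings below are hypothetical placeholders; substitute your own file
# and question. Note that sent_tokenize requires NLTK's "punkt" data to be
# downloaded once (newer NLTK versions may also need "punkt_tab").
if __name__ == "__main__":
    import nltk

    nltk.download("punkt", quiet=True)

    # Hypothetical example inputs
    example_pdf = "paper.pdf"
    example_query = "What dataset was used for evaluation?"

    # Returns the top-5 sentences joined into a single string
    top_sentences = retrieve_from_pdf(example_pdf, example_query, k=5)
    print(top_sentences)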