Spaces:
Runtime error
Runtime error
import fitz # PyMuPDF | |
import numpy as np | |
import itertools as it | |
import operator as op | |
from typing import List, Tuple | |
from sentence_transformers import SentenceTransformer | |
""" | |
Sur la partie strategies.py nous allons : | |
1. convertir notre pdf en texte | |
2. diviser le texte en plusieur paragraphes (chunks dans le code) | |
3. transformer le texte en vecteur (embeddings) | |
4. Aller chercher le bon candicat (trouver la réponse pertinente de l'utilisateur dans le pdf) | |
""" | |
def convert_pdf_to_text(pdf_data: str) -> List[str]:
    """Extract the text of every page of a PDF.

    Args:
        pdf_data: Value handed to ``fitz.open`` (annotated as ``str``;
            presumably a file path -- confirm against callers).

    Returns:
        One string per page, in document order.
    """
    document = fitz.open(pdf_data)
    try:
        # Iterating the document yields its pages in order.
        return [page.get_text() for page in document]
    finally:
        # Release the document even if text extraction fails part-way.
        document.close()
def split_pages_into_chunks(pages: List[str], chunk_size: int, tokenizer) -> List[str]:
    """Re-split a document into chunks of roughly ``chunk_size`` tokens.

    Every page is tokenized, the tokens are concatenated into one stream,
    and that stream is cut into near-equal parts which are decoded back to
    text. Chunk boundaries therefore ignore page boundaries.

    Args:
        pages: Text of each page, in reading order.
        chunk_size: Target number of tokens per chunk; must be positive.
        tokenizer: Object exposing ``encode(text)`` (returning token ids)
            and ``decode(token_ids)`` (returning text).

    Returns:
        The decoded chunks in document order, or an empty list when the
        pages contain no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # NOTE(review): if tokenizer.encode adds special tokens on every call,
    # they end up inside the chunks -- confirm against the tokenizer used.
    page_tokens: List[List[int]] = [tokenizer.encode(page) for page in pages]
    document_tokens = list(it.chain.from_iterable(page_tokens))
    if not document_tokens:
        return []

    # round() gives 0 for a document shorter than half a chunk, which
    # np.array_split rejects; always keep at least one partition.
    nb_partitions = max(1, round(len(document_tokens) / chunk_size))
    return [
        tokenizer.decode(chunk_tokens)
        for chunk_tokens in np.array_split(document_tokens, nb_partitions)
    ]
def vectorize(chunks: List[str], transformer: SentenceTransformer, device: str = 'cpu') -> List[Tuple[str, np.ndarray]]:
    """Embed each chunk and pair it with its embedding.

    Args:
        chunks: Texts to embed.
        transformer: Model whose ``encode`` method produces the embeddings.
        device: Forwarded to ``transformer.encode``; defaults to ``'cpu'``.

    Returns:
        A list of ``(chunk, embedding)`` tuples, paired by position.
    """
    encode_options = {'batch_size': 32, 'device': device, 'show_progress_bar': True}
    vectors = transformer.encode(sentences=chunks, **encode_options)
    return [(text, vector) for text, vector in zip(chunks, vectors)]
def find_candidates(query_embedding: np.ndarray, chunks: List[str], corpus_embeddings: np.ndarray, top_k: int = 5) -> List[str]:
    """Return the chunks most similar to the query, best match first.

    Similarity is the cosine between the query embedding and each row of
    ``corpus_embeddings``. Ties keep the original order of ``chunks``.

    Args:
        query_embedding: Embedding of the query, either a flat vector or a
            single row of shape ``(1, dim)``.
        chunks: Candidate texts; ``chunks[i]`` is scored with row ``i`` of
            ``corpus_embeddings``.
        corpus_embeddings: 2-D array with one embedding per row.
        top_k: Number of chunks to keep (applied as a slice bound).

    Returns:
        Up to ``top_k`` chunks sorted by decreasing similarity; an empty
        list when ``chunks`` is empty.
    """
    if len(chunks) == 0:
        return []

    # Accept a single query given as a (1, dim) row as well as a flat
    # vector; a 2-D query would otherwise be zipped against one chunk only.
    query = np.asarray(query_embedding).reshape(-1)
    corpus = np.asarray(corpus_embeddings)

    dot_product = query @ corpus.T
    norms = np.linalg.norm(query) * np.linalg.norm(corpus, axis=1)
    # Cosine similarity: cos(u, v) = u.v / (|u| |v|). A zero-norm vector has
    # no direction, so score it 0 instead of dividing by zero: NaN scores
    # would make the ordering below meaningless.
    scores = np.divide(
        dot_product,
        norms,
        out=np.zeros(dot_product.shape, dtype=float),
        where=norms > 0,
    )

    # sorted() is stable, so equal scores keep their original relative order.
    ranked = sorted(zip(chunks, scores), key=op.itemgetter(1), reverse=True)
    return [chunk for chunk, _ in ranked[:top_k]]