# papasega's picture
# add files
# cf3dd65
import fitz # PyMuPDF
import numpy as np
import itertools as it
import operator as op
from typing import List, Tuple
from sentence_transformers import SentenceTransformer
"""
In strategies.py we:
1. convert the PDF to text
2. split the text into several paragraphs ("chunks" in the code)
3. turn the text into vectors (embeddings)
4. fetch the best candidates (find, in the PDF, the passages relevant to the user's question)
"""
def convert_pdf_to_text(pdf_data: str) -> List[str]:
    """Extract the text of every page of a PDF.

    Args:
        pdf_data: Value handed straight to ``fitz.open``; annotated as ``str``
            (presumably a file path -- confirm against the caller).

    Returns:
        One string per page, in page order, as returned by ``page.get_text()``.
    """
    # The context manager closes the document even when text extraction
    # raises part-way through, so the file handle is never leaked.
    with fitz.open(pdf_data) as document:
        return [page.get_text() for page in document]
def split_pages_into_chunks(pages: List[str], chunk_size: int, tokenizer) -> List[str]:
    """Re-split a document into chunks of roughly ``chunk_size`` tokens.

    Every page is tokenized, the token ids are concatenated into a single
    stream (page boundaries are ignored), the stream is cut into
    ``round(nb_tokens / chunk_size)`` near-equal parts (never fewer than one),
    and each part is decoded back to text.

    Args:
        pages: Text of each page, in reading order.
        chunk_size: Target number of tokens per chunk; must be positive.
            Chunks are near-equal in length, so an individual chunk can be
            somewhat shorter or longer than this target.
        tokenizer: Object exposing ``encode(text)`` (returns an iterable of
            token ids) and ``decode(ids)`` (returns the matching text).

    Returns:
        The decoded chunks in document order; an empty list when the pages
        produce no tokens at all.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    document_tokens = list(
        it.chain.from_iterable(tokenizer.encode(page) for page in pages)
    )
    if not document_tokens:
        return []

    # round() gives 0 for a document shorter than half a chunk, and
    # np.array_split rejects 0 sections -- always keep at least one partition.
    nb_partitions = max(1, round(len(document_tokens) / chunk_size))

    # .tolist() hands the tokenizer plain Python ints instead of a NumPy array.
    return [
        tokenizer.decode(chunk_tokens.tolist())
        for chunk_tokens in np.array_split(document_tokens, nb_partitions)
    ]
def vectorize(chunks: List[str], transformer: SentenceTransformer, device: str = 'cpu') -> List[Tuple[str, np.ndarray]]:
    """Embed every chunk and pair each chunk with its embedding.

    Args:
        chunks: Texts to embed.
        transformer: Model whose ``encode`` method produces one embedding per
            input sentence.
        device: Device name forwarded to ``transformer.encode``.

    Returns:
        A list of ``(chunk, embedding)`` tuples in the same order as
        ``chunks``.
    """
    # Encoding runs in batches of 32 with the progress bar enabled.
    vectors = transformer.encode(
        sentences=chunks,
        batch_size=32,
        device=device,
        show_progress_bar=True,
    )
    return [(chunk, vector) for chunk, vector in zip(chunks, vectors)]
def find_candidates(query_embedding: np.ndarray, chunks: List[str], corpus_embeddings: np.ndarray, top_k: int = 5) -> List[str]:
    """Return the ``top_k`` chunks whose embeddings best match the query.

    Chunks are ranked by cosine similarity between ``query_embedding`` and
    the matching row of ``corpus_embeddings``, highest first. Chunks with
    equal scores keep their original relative order.

    Args:
        query_embedding: 1-D embedding of the query.
        chunks: Candidate texts; ``chunks[i]`` corresponds to row ``i`` of
            ``corpus_embeddings``.
        corpus_embeddings: 2-D array-like with one embedding per row.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, most similar first. Empty when ``top_k`` is
        not positive or there is nothing to rank.
    """
    corpus = np.asarray(corpus_embeddings)
    if top_k <= 0 or len(chunks) == 0 or corpus.size == 0:
        return []

    # Cosine similarity: cos(u, v) = u.v / (|u| |v|)
    dot_product = query_embedding @ corpus.T
    norms = np.linalg.norm(query_embedding) * np.linalg.norm(corpus, axis=1)

    # A zero-norm vector would give 0/0 = NaN, and NaN makes the sort order
    # undefined; score such pairs as 0 (no similarity) instead.
    scores = np.zeros(norms.shape, dtype=float)
    comparable = norms > 0
    scores[comparable] = dot_product[comparable] / norms[comparable]

    ranked = sorted(zip(chunks, scores), key=op.itemgetter(1), reverse=True)
    return [chunk for chunk, _ in ranked[:top_k]]