"""Retrieval strategies for answering questions from a PDF.

The pipeline implemented in this module:

1. convert the PDF to text;
2. split the text into several paragraphs ("chunks" in the code);
3. turn the text into vectors (embeddings);
4. look up the best candidates, i.e. the chunks of the PDF that are the most
   relevant to the user's query.
"""

import itertools as it
import operator as op
from typing import List, Tuple

import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer


def convert_pdf_to_text(pdf_data: str) -> List[str]:
    """Extract the text of every page of a PDF.

    Args:
        pdf_data: Value handed straight to ``fitz.open`` (presumably a
            filesystem path, given the ``str`` annotation).

    Returns:
        One string per page, in page order.
    """
    # The context manager closes the document even when a page fails to
    # extract, which a trailing ``document.close()`` would not do.
    with fitz.open(pdf_data) as document:
        return [page.get_text() for page in document]


def split_pages_into_chunks(pages: List[str], chunk_size: int, tokenizer) -> List[str]:
    """Re-split the text of a document into chunks of about ``chunk_size`` tokens.

    All pages are tokenized and concatenated, so a chunk may span a page
    boundary. The token stream is cut into ``round(nb_tokens / chunk_size)``
    nearly equal parts (at least one), so ``chunk_size`` is a target rather
    than a hard upper bound.

    Args:
        pages: Text of each page, in order.
        chunk_size: Target number of tokens per chunk; must be positive.
        tokenizer: Object exposing ``encode(text)`` (returning token ids) and
            ``decode(token_ids)`` (returning text).

    Returns:
        The decoded chunks, in document order. Empty when the pages contain
        no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not strictly positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    page_tokens: List[List[int]] = [tokenizer.encode(page) for page in pages]
    document_tokens = list(it.chain.from_iterable(page_tokens))
    if not document_tokens:
        return []

    # round() yields 0 for documents shorter than half a chunk, and
    # np.array_split rejects 0 sections, so always keep at least one.
    nb_partitions = max(1, round(len(document_tokens) / chunk_size))
    return [
        tokenizer.decode(chunk_tokens)
        for chunk_tokens in np.array_split(document_tokens, nb_partitions)
    ]


def vectorize(
    chunks: List[str],
    transformer: SentenceTransformer,
    device: str = 'cpu',
) -> List[Tuple[str, np.ndarray]]:
    """Embed every chunk with the given sentence transformer.

    Args:
        chunks: Texts to embed.
        transformer: Model whose ``encode`` method produces the embeddings.
        device: Device forwarded to ``transformer.encode``.

    Returns:
        ``(chunk, embedding)`` pairs, in the same order as ``chunks``.
    """
    embeddings = transformer.encode(
        sentences=chunks,
        batch_size=32,
        device=device,
        show_progress_bar=True,
    )
    return list(zip(chunks, embeddings))


def find_candidates(
    query_embedding: np.ndarray,
    chunks: List[str],
    corpus_embeddings: np.ndarray,
    top_k: int = 5,
) -> List[str]:
    """Return the ``top_k`` chunks most similar to the query.

    Similarity is the cosine similarity ``cos(u, v) = u.v / (|u| |v|)``.
    An all-zero embedding has no direction; it is given a score of 0 instead
    of the NaN a division by zero would produce.

    Args:
        query_embedding: Embedding of the query; flattened to one dimension,
            so both ``(d,)`` and ``(1, d)`` inputs are accepted.
        chunks: Candidate texts, aligned with the rows of
            ``corpus_embeddings``.
        corpus_embeddings: One embedding per row, i.e. shape
            ``(len(chunks), d)``.
        top_k: Maximum number of chunks to return.

    Returns:
        At most ``top_k`` chunks, best match first; ties keep their original
        order. Empty when ``top_k <= 0`` or there are no chunks.
    """
    if top_k <= 0 or len(chunks) == 0:
        return []

    query = np.asarray(query_embedding).reshape(-1)
    corpus = np.asarray(corpus_embeddings)

    dot_product = query @ corpus.T
    norms = np.linalg.norm(query) * np.linalg.norm(corpus, axis=1)
    # A zero norm implies a zero dot product, so dividing by 1 there gives a
    # score of 0 without triggering a division-by-zero warning.
    weighted_scores = dot_product / np.where(norms > 0, norms, 1.0)

    ranked = sorted(zip(chunks, weighted_scores), key=op.itemgetter(1), reverse=True)
    return [chunk for chunk, _ in ranked[:top_k]]