from transformers import (
    RagTokenizer,
    RagRetriever,
    RagTokenForGeneration,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)
from pdfminer.high_level import extract_text
from docx import Document
from dataclasses import dataclass
import pandas as pd

# RAG question encoder/generator. RagTokenForGeneration needs a retriever
# (or precomputed context tensors) at generation time, so one is attached
# here; the dummy wiki_dpr index keeps the download manageable.
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
rag_retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
)
rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=rag_retriever)

# Phi-2 produces the final answer from the question plus the retrieved context.
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)


@dataclass
class Paragraph:
    page_num: int
    paragraph_num: int
    content: str


def read_pdf_pdfminer(file_path) -> list[Paragraph]:
    # pdfminer's extract_text returns one flat string without page markers,
    # so page_num stays 0 and splitting on ". " approximates paragraph breaks.
    text = extract_text(file_path).replace('\n', ' ').strip()
    paragraphs = text.split(". ")
    return [Paragraph(0, i, para) for i, para in enumerate(paragraphs, 1)]


def read_docx(file) -> list[Paragraph]:
    doc = Document(file)
    return [
        Paragraph(1, i, para.text.strip())
        for i, para in enumerate(doc.paragraphs, 1)
        if para.text.strip()
    ]


def generate_context_with_rag(question: str) -> str:
    # RAG retrieves supporting passages and generates a short grounded answer,
    # which is reused below as extra context for Phi-2.
    inputs = rag_tokenizer(question, return_tensors="pt")
    output_ids = rag_model.generate(**inputs)
    context = rag_tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    return context


def generate_answer_with_phi(question: str, context: str) -> str:
    # Encode the prompt (truncated to 512 tokens) and generate up to 600 tokens
    # in total; pad_token_id is set to eos to avoid the missing-pad-token warning.
    enhanced_question = f"Question: {question}\nContext: {context}\nAnswer:"
    inputs = phi_tokenizer.encode(enhanced_question, return_tensors="pt", max_length=512, truncation=True)
    outputs = phi_model.generate(inputs, max_length=600, pad_token_id=phi_tokenizer.eos_token_id)
    answer = phi_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer
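

# Optional post-processing sketch (not part of the pipeline above): the decoded
# output echoes the prompt, so the bare answer can be recovered by splitting on
# the "Answer:" marker embedded in enhanced_question.
def extract_answer_text(full_output: str) -> str:
    # Hypothetical helper; assumes the prompt format used in generate_answer_with_phi.
    return full_output.split("Answer:")[-1].strip()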


def answer_question(question: str, documents_df: pd.DataFrame) -> str:
    # Join every extracted paragraph, let RAG condense it into a short context,
    # then ask Phi-2 for the final answer.
    combined_text = " ".join(documents_df['content'].tolist())
    context = generate_context_with_rag(combined_text + " " + question)
    answer = generate_answer_with_phi(question, context)
    return answer
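

# Example usage (a minimal sketch): the file names and the question are
# placeholders; answer_question only needs a DataFrame with a 'content'
# column, which is built here from the reader outputs.
if __name__ == "__main__":
    paragraphs = read_docx("report.docx") + read_pdf_pdfminer("report.pdf")
    documents_df = pd.DataFrame(
        [
            {"page_num": p.page_num, "paragraph_num": p.paragraph_num, "content": p.content}
            for p in paragraphs
        ]
    )
    print(answer_question("What are the key findings of the report?", documents_df))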