"""Question answering over uploaded documents: RAG generates a context
passage, then Phi-2 produces the final answer from question + context."""

from dataclasses import dataclass

import pandas as pd
from docx import Document
from pdfminer.high_level import extract_text
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    RagTokenForGeneration,
    RagTokenizer,
    pipeline,
)

# Initialize RAG (context generation).
# NOTE(review): no retriever is attached; rag_model.generate() runs with the
# model's default configuration — confirm this is intended.
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

# Initialize Phi-2 (answer generation).
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)


@dataclass
class Paragraph:
    """One extracted chunk of text from a source document."""

    page_num: int        # 1-based page marker (real pages are not tracked here)
    paragraph_num: int   # 1-based position within the document
    content: str         # the paragraph/sentence text


def read_pdf_pdfminer(file_path) -> list[Paragraph]:
    """Extract a PDF's text and split it into sentence-level chunks.

    NOTE: splitting on ". " is a crude sentence heuristic — it drops the
    delimiter and mis-splits abbreviations (e.g. "e.g. ").

    Fixes vs. the original: empty fragments (a text ending in "." produces a
    trailing empty chunk) are filtered out, and page_num is 1 for consistency
    with read_docx (was 0).
    """
    text = extract_text(file_path).replace('\n', ' ').strip()
    chunks = [c for c in (s.strip() for s in text.split(". ")) if c]
    return [Paragraph(1, i, chunk) for i, chunk in enumerate(chunks, 1)]


def read_docx(file) -> list[Paragraph]:
    """Read a .docx file and return its non-empty paragraphs.

    paragraph_num follows the document's paragraph order, so numbers are
    non-contiguous where empty paragraphs were skipped (original behavior).
    """
    doc = Document(file)
    return [
        Paragraph(1, i, para.text.strip())
        for i, para in enumerate(doc.paragraphs, 1)
        if para.text.strip()
    ]


def generate_context_with_rag(question: str) -> str:
    """Generate a context passage for *question* using the RAG model."""
    inputs = rag_tokenizer(question, return_tensors="pt")
    output_ids = rag_model.generate(**inputs)
    return rag_tokenizer.decode(output_ids[0], skip_special_tokens=True)


def generate_answer_with_phi(question: str, context: str) -> str:
    """Answer *question* given *context* using Phi-2.

    The prompt is truncated to 512 tokens before generation.
    """
    enhanced_question = f"Question: {question}\nContext: {context}\nAnswer:"
    inputs = phi_tokenizer.encode(
        enhanced_question, return_tensors="pt", max_length=512, truncation=True
    )
    # max_new_tokens instead of max_length: max_length counts the prompt too,
    # so a 512-token prompt left at most 88 tokens for the actual answer.
    outputs = phi_model.generate(inputs, max_new_tokens=200)
    return phi_tokenizer.decode(outputs[0], skip_special_tokens=True)


def answer_question(question: str, documents_df: pd.DataFrame) -> str:
    """Answer *question* from the uploaded documents.

    documents_df is expected to have a 'content' column holding the extracted
    text chunks.  The combined document text plus the question is fed to RAG
    to produce a context, which Phi-2 then uses to generate the answer.
    """
    combined_text = " ".join(documents_df['content'].tolist())
    context = generate_context_with_rag(combined_text + " " + question)
    return generate_answer_with_phi(question, context)