File size: 2,087 Bytes
ade8cf8
21583be
ade8cf8
21583be
ade8cf8
95269b9
ade8cf8
 
 
95269b9
ade8cf8
 
 
21583be
 
 
 
 
 
 
ade8cf8
21583be
ade8cf8
 
21583be
ade8cf8
21583be
ade8cf8
 
 
 
 
 
 
 
 
 
 
 
 
21583be
95269b9
ade8cf8
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from transformers import RagTokenizer, RagTokenForGeneration, AutoTokenizer, AutoModelForCausalLM, pipeline
from pdfminer.high_level import extract_text
from docx import Document
from dataclasses import dataclass
import pandas as pd

# Initialize RAG
# The tokenizer and generator for the "facebook/rag-token-nq" checkpoint are
# loaded once, at import time, and kept as module-level globals that
# generate_context_with_rag() uses.
# NOTE(review): no `retriever=` argument is passed to RagTokenForGeneration
# here, and generate_context_with_rag() calls generate() with only the
# tokenizer output. Presumably generate() then needs a retriever (or
# precomputed context inputs) to run — confirm against the transformers RAG docs.
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

# Initialize Phi-2
# Also loaded at import time; these globals are used by
# generate_answer_with_phi().
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# checkpoint to be executed locally — confirm this is intended and consider
# pinning a revision.
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)

@dataclass
class Paragraph:
    """One chunk of text extracted from a document.

    Instances are produced by read_pdf_pdfminer() and read_docx() in this
    file.
    """

    # Page number of the chunk. The readers in this file do not compute real
    # page numbers: read_pdf_pdfminer() always passes 0 and read_docx()
    # always passes 1.
    page_num: int
    # 1-based position of the chunk within the sequence the reader iterated.
    paragraph_num: int
    # Text of the chunk.
    content: str

def read_pdf_pdfminer(file_path) -> list[Paragraph]:
    """Extract the text of a PDF and split it into sentence-like chunks.

    The extracted text is flattened onto one line (newlines become spaces)
    and then cut at every ". " separator, so each chunk is roughly one
    sentence rather than a layout paragraph.

    Args:
        file_path: Passed unchanged to pdfminer's ``extract_text``.

    Returns:
        One ``Paragraph`` per non-blank chunk, with surrounding whitespace
        removed from its content. ``page_num`` is always 0 because page
        boundaries are not tracked here. ``paragraph_num`` is the chunk's
        1-based position in the split, so numbering can have gaps where
        blank chunks were skipped (the same convention ``read_docx`` uses).
        A PDF that yields no text produces an empty list.
    """
    flattened = extract_text(file_path).replace('\n', ' ').strip()
    # Blank chunks (empty document, or separators that sit next to each
    # other) carry no content, so they are dropped instead of being emitted
    # as empty Paragraph objects.
    return [
        Paragraph(0, position, content)
        for position, chunk in enumerate(flattened.split(". "), 1)
        if (content := chunk.strip())
    ]

def read_docx(file) -> list[Paragraph]:
    """Collect the non-blank paragraphs of a .docx document.

    Args:
        file: Passed unchanged to ``docx.Document``.

    Returns:
        One ``Paragraph`` per paragraph whose text is non-empty after
        stripping whitespace. ``page_num`` is always 1 and
        ``paragraph_num`` is the paragraph's 1-based position among all
        paragraphs of the document, blank ones included, so numbering may
        have gaps.
    """
    collected = []
    for position, block in enumerate(Document(file).paragraphs, 1):
        stripped = block.text.strip()
        if stripped:
            collected.append(Paragraph(1, position, stripped))
    return collected

def generate_context_with_rag(question: str) -> str:
    """Run the module-level RAG model on *question* and return its decoded output.

    Args:
        question: Text handed to ``rag_tokenizer`` as the model input.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = rag_tokenizer(question, return_tensors="pt")
    generated = rag_model.generate(**encoded)
    # Only the first returned sequence is used.
    first_sequence = generated[0]
    return rag_tokenizer.decode(first_sequence, skip_special_tokens=True)

def generate_answer_with_phi(question: str, context: str) -> str:
    """Ask the module-level Phi-2 model to answer *question* given *context*.

    The prompt has the form ``Question: ...\\nContext: ...\\nAnswer:`` and is
    truncated to at most 512 tokens before generation.

    Args:
        question: The user's question.
        context: Supporting text placed after the question in the prompt.

    Returns:
        Only the text the model generated after the prompt, with
        surrounding whitespace stripped. The prompt itself is not echoed
        back.
    """
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    input_ids = phi_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    # max_length counts prompt tokens plus generated tokens.
    output_ids = phi_model.generate(input_ids, max_length=600)
    # A causal LM returns the prompt tokens followed by the continuation.
    # Drop the prompt prefix so the caller receives just the answer instead
    # of the whole "Question/Context/Answer:" transcript.
    prompt_length = input_ids.shape[-1]
    answer_ids = output_ids[0][prompt_length:]
    return phi_tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

def answer_question(question: str, documents_df: pd.DataFrame) -> str:
    """Answer *question* using the text stored in *documents_df*.

    Every value of the ``'content'`` column is joined with single spaces,
    the question is appended, and that string is sent to
    ``generate_context_with_rag``. The resulting context and the original
    question are then passed to ``generate_answer_with_phi``.

    Args:
        question: The user's question.
        documents_df: Frame with a ``'content'`` column; presumably one row
            per extracted text chunk of the uploaded files (verify against
            the caller).

    Returns:
        Whatever ``generate_answer_with_phi`` returns.
    """
    document_texts = documents_df['content'].tolist()
    rag_query = " ".join(document_texts) + " " + question
    return generate_answer_with_phi(question, generate_context_with_rag(rag_query))