|
import os |
|
import gradio as gr |
|
import pdfplumber |
|
import re |
|
|
|
from langchain.docstore.document import Document |
|
from langchain.vectorstores import FAISS |
|
from langchain.embeddings.base import Embeddings |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from langchain.chains import RetrievalQA |
|
from langchain.prompts import PromptTemplate |
|
from langchain_openai import ChatOpenAI |
|
|
|
|
|
|
|
# Expose the OpenRouter credentials under the OPENAI_* environment variable
# names. The OpenRouter key must already be set: a missing key raises KeyError
# here, before any of the three variables is written.
os.environ.update(
    {
        "OPENAI_API_KEY": os.environ["OPENROUTER_API_KEY"],
        "OPENAI_API_BASE": "https://openrouter.ai/api/v1",
        "OPENAI_API_HEADERS": '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}',
    }
)
|
|
|
|
|
def _read_pdf_text(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Spans matching the two hard-coded boilerplate patterns are removed from
    each page, and every page's text is terminated with a newline.
    """
    boilerplate_patterns = (
        re.compile(r'Systems Campus.*?Lahore'),
        re.compile(r'E-mail:.*?systemsltd\.com'),
    )
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Pages without extractable text contribute nothing.
                continue
            for boilerplate in boilerplate_patterns:
                text = boilerplate.sub('', text)
            page_texts.append(text + "\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)


def _split_titled_sections(full_text):
    """Split *full_text* into ``(title, content)`` pairs at heading lines.

    A heading is a capitalised run of 4-51 characters at the start of a line,
    followed by a colon.
    """
    # ``^`` with MULTILINE also matches at the very start of the text, so a
    # heading on the first line is recognised too (a ``(?<=\n)`` lookbehind
    # would miss it).
    heading = re.compile(r"^([A-Z][^\n]{3,50}):", re.MULTILINE)
    parts = heading.split(full_text)
    # With one capture group, split() yields
    # [preamble, title1, body1, title2, body2, ...]; the preamble is dropped.
    return [
        (title.strip(), body.strip())
        for title, body in zip(parts[1::2], parts[2::2])
    ]


def extract_clean_sections(file_path):
    """Parse a policy PDF into one ``Document`` per titled section.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is
        ``"<title>:\\n<content>"`` and whose metadata carries the title under
        ``"section"``. Sections whose content is 20 characters or shorter are
        skipped. The list is empty when no heading is found.
    """
    sections = _split_titled_sections(_read_pdf_text(file_path))
    return [
        Document(page_content=f"{title}:\n{content}", metadata={"section": title})
        for title, content in sections
        if len(content) > 20
    ]
|
|
|
|
|
class TfidfEmbedding(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a TF-IDF vectorizer.

    The vectorizer has to be fitted on the corpus with :meth:`fit` before
    :meth:`embed_documents` or :meth:`embed_query` is called. The length of
    each vector equals the size of the fitted vocabulary.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Learn the vocabulary and IDF weights from *texts*."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return one dense TF-IDF vector per text, as a list of float lists."""
        # The Embeddings interface declares plain Python lists; converting
        # avoids handing a NumPy array to callers that expect a list.
        return self.vectorizer.transform(texts).toarray().tolist()

    def embed_query(self, text):
        """Return the dense TF-IDF vector of a single text as a list of floats."""
        return self.vectorizer.transform([text]).toarray()[0].tolist()
|
|
|
|
|
# Prompt handed to the QA chain. ``{context}`` and ``{question}`` are the two
# placeholders declared as input variables on the PromptTemplate below.
TEMPLATE = """
You are a strict healthcare policy checker for Systems Ltd.

Always begin your answer clearly:
- Say "Yes, ..." if the claim is valid
- Say "No, ..." if the claim is not valid
- Say "Partially, ..." if it's conditionally allowed

Use the following policy information to support your answer.

{context}

Question: {question}
Answer:
"""

custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=TEMPLATE,
)
|
|
|
|
|
|
|
def initialize_policy(pdf_path="healthcare_policy.pdf"):
    """Build the retrieval QA chain and store it in the global ``qa_chain``.

    The PDF is split into titled sections, a TF-IDF embedder is fitted on
    them, the sections are indexed in FAISS, and a "stuff" RetrievalQA chain
    using ``custom_prompt`` is created on top of the index.

    Args:
        pdf_path: Path of the policy PDF. Defaults to the file the app has
            always used, so existing zero-argument callers are unaffected.

    Raises:
        ValueError: If no sections could be extracted from the PDF.
    """
    global qa_chain

    docs = extract_clean_sections(pdf_path)
    if not docs:
        # Fail with a clear message instead of an obscure error from the
        # vectorizer or the index builder further down.
        raise ValueError(f"No policy sections could be extracted from {pdf_path!r}")
    texts = [doc.page_content for doc in docs]

    embedder = TfidfEmbedding()
    embedder.fit(texts)
    # from_documents keeps each section's metadata in the index; from_texts
    # would store the page content only.
    vectordb = FAISS.from_documents(docs, embedder)
    retriever = vectordb.as_retriever()

    llm = ChatOpenAI(
        model="tngtech/deepseek-r1t2-chimera:free",
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENAI_API_KEY"),
        default_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "PDF-RAG",
        },
        temperature=0.0,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": custom_prompt},
    )
|
|
|
|
|
def ask_policy_question(question):
    """Answer a claim question with the policy QA chain.

    Args:
        question: The user's question text.

    Returns:
        The chain's answer, or a short user-facing message when the question
        is blank, the chain has not been built yet, or the chain raised.
    """
    # A blank question would only spend an LLM call on nothing.
    if not question or not question.strip():
        return "Please enter a question."
    if qa_chain is None:
        return "The policy is still loading. Please wait."
    try:
        return qa_chain.run(question)
    except Exception as e:
        # UI boundary: show the failure in the answer box instead of raising.
        return f"Error: {e}"
|
|
|
|
|
|
|
# Set by initialize_policy(); None means the chain has not been built yet.
qa_chain = None
status_text = "Loading..."

with gr.Blocks() as demo:
    gr.Markdown("## SL HealthCare Claim Checker (RAG)")
    status_box = gr.Textbox(label="Status", value=status_text, interactive=False)

    with gr.Row():
        question = gr.Textbox(label="Enter your claim question")
        ask_btn = gr.Button("Ask")

    answer = gr.Textbox(label="Answer", lines=6)
    ask_btn.click(fn=ask_policy_question, inputs=question, outputs=answer)

    def startup():
        """Build the QA chain if needed and return the status message.

        This runs on every page load, so the expensive initialisation is
        skipped once ``qa_chain`` exists. If initialisation fails, the error
        is returned as the status and the next page load tries again.
        """
        global status_text
        if qa_chain is None:
            try:
                initialize_policy()
            except Exception as e:
                status_text = f"Failed to load policy: {e}"
                return status_text
        status_text = "Policy loaded. You may now ask questions."
        return status_text

    # Event listeners have to be registered inside the Blocks context.
    demo.load(fn=startup, outputs=status_box)

demo.launch()
|
|