# app.py — SL HealthCare Claim Checker: Gradio + LangChain RAG over a policy PDF.
import os
import gradio as gr
import pdfplumber
import re
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
# Route the OpenAI-compatible client through OpenRouter.
# NOTE(review): this raises KeyError at import time if OPENROUTER_API_KEY is
# not set in the environment/Space secrets — confirm fail-fast is intended.
os.environ["OPENAI_API_KEY"] = os.environ["OPENROUTER_API_KEY"]
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
# NOTE(review): OPENAI_API_HEADERS does not appear to be read anywhere in this
# file; the same headers are passed explicitly via `default_headers` below.
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'
#Section-aware PDF extractor
def extract_clean_sections(file_path):
    """Extract section-level Documents from the policy PDF.

    Page text is concatenated (after stripping recurring address/e-mail
    boilerplate), then split on "Heading:" style lines. Sections whose
    body is 20 characters or shorter are discarded.
    """
    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            # Remove recurring header/footer noise before sectioning.
            page_text = re.sub(r'Systems Campus.*?Lahore', '', page_text)
            page_text = re.sub(r'E-mail:.*?systemsltd\.com', '', page_text)
            pages.append(page_text)
    combined = "".join(p + "\n" for p in pages)

    # A heading is a capitalized line of 4-51 chars ending in ':',
    # matched just after a newline.
    heading_re = r"(?<=\n)([A-Z][^\n]{3,50}):"
    chunks = re.split(heading_re, combined)

    # re.split with one capture group alternates [pre, title, body, title, ...],
    # so pairing odd-indexed titles with the following bodies is always safe.
    sections = []
    for title, body in zip(chunks[1::2], chunks[2::2]):
        title = title.strip()
        body = body.strip()
        if len(body) > 20:
            sections.append(
                Document(
                    page_content=f"{title}:\n{body}",
                    metadata={"section": title},
                )
            )
    return sections
#TF-IDF Embedding for RAG
class TfidfEmbedding(Embeddings):
    """LangChain Embeddings adapter backed by a scikit-learn TF-IDF vectorizer.

    `fit` must be called on the corpus before `embed_documents` or
    `embed_query`; otherwise sklearn raises NotFittedError.
    """

    def __init__(self):
        # Vocabulary is learned lazily via fit(); defaults are sklearn's.
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Learn the TF-IDF vocabulary/IDF weights from the corpus."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return one dense vector per text as List[List[float]].

        `.tolist()` satisfies the LangChain `Embeddings` interface contract
        (plain Python lists) instead of leaking numpy arrays to callers.
        """
        return self.vectorizer.transform(texts).toarray().tolist()

    def embed_query(self, text):
        """Return the dense vector for a single query as List[float]."""
        return self.vectorizer.transform([text]).toarray()[0].tolist()
# Prompt: forces the model to open with a Yes / No / Partially verdict,
# grounded only in the retrieved policy context.
TEMPLATE = """
You are a strict healthcare policy checker for Systems Ltd.
Always begin your answer clearly:
- Say "Yes, ..." if the claim is valid
- Say "No, ..." if the claim is not valid
- Say "Partially, ..." if it's conditionally allowed
Use the following policy information to support your answer.
{context}
Question: {question}
Answer:
"""
custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
# Module-level state shared between the two Gradio callbacks:
# populated by load_policy(), read by ask_policy_question().
retriever = None
qa_chain = None
# ✅ Build the retriever + QA chain once when the load button is clicked.
def load_policy(pdf_path="healthcare_policy.pdf"):
    """Index the policy PDF and construct the RetrievalQA chain.

    Args:
        pdf_path: Path to the policy PDF. Defaults to the bundled file,
            preserving the original hard-coded behavior.

    Returns:
        A status string for the Gradio status textbox. Failures (missing
        file, no extractable sections) are reported as text rather than
        raised, so the UI callback never crashes.
    """
    global retriever, qa_chain
    try:
        docs = extract_clean_sections(pdf_path)
    except FileNotFoundError:
        return f"Error: policy file '{pdf_path}' not found."
    if not docs:
        return "Error: no policy sections could be extracted from the PDF."
    texts = [doc.page_content for doc in docs]
    # TF-IDF must be fitted on the corpus before FAISS calls embed_documents.
    embedder = TfidfEmbedding()
    embedder.fit(texts)
    vectordb = FAISS.from_texts(texts, embedder)
    retriever = vectordb.as_retriever()
    llm = ChatOpenAI(
        model="tngtech/deepseek-r1t2-chimera:free",
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENAI_API_KEY"),
        default_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "PDF-RAG",
        },
        temperature=0.0,  # deterministic: policy checking should not be creative
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # concatenate all retrieved sections into one prompt
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": custom_prompt},
    )
    return "Policy loaded. You may now ask questions."
# ✅ Answer a claim question against the loaded policy chain.
def ask_policy_question(question):
    """Run a claim question through the QA chain and return the answer text.

    Returns a prompt-to-load message if load_policy() has not been run yet.
    """
    if qa_chain is None:
        return "Please click 'Ask about claim' to load the policy first."
    try:
        answer_text = qa_chain.run(question)
    except Exception as exc:
        # UI boundary: surface the failure as text rather than crashing Gradio.
        return f"Error: {str(exc)}"
    return answer_text
# ✅ Gradio UI: load button + status row, then question box and answer box.
with gr.Blocks() as demo:
    gr.Markdown("## SL HealthCare Claim Checker (RAG)")
    # Fixed mojibake in the button label: "πŸ“₯" was the UTF-8 bytes of the
    # 📥 (inbox tray) emoji decoded with the wrong codec.
    load_btn = gr.Button("📥 Ask about claim (Load Policy)")
    load_status = gr.Textbox(label="Status")
    # Builds the index and QA chain; writes the status message to load_status.
    load_btn.click(fn=load_policy, outputs=load_status)
    with gr.Row():
        question = gr.Textbox(label="Enter your claim question")
        ask_btn = gr.Button("Ask")
    answer = gr.Textbox(label="Answer", lines=6)
    ask_btn.click(fn=ask_policy_question, inputs=question, outputs=answer)
demo.launch()