# PDF-RAG healthcare policy claim checker (Gradio UI, TF-IDF + FAISS retrieval,
# OpenRouter-hosted LLM). The original lines here were web-viewer residue
# (file size, commit hashes, line-number gutter), not part of the program.
import os
import gradio as gr
import pdfplumber
import re
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
# Route any OpenAI-compatible client through OpenRouter using the
# OPENROUTER_API_KEY secret (raises KeyError at import time if unset).
os.environ["OPENAI_API_KEY"] = os.environ["OPENROUTER_API_KEY"]
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
# NOTE(review): "OPENAI_API_HEADERS" is not a standard variable read by
# langchain-openai; the effective headers are the default_headers passed
# to ChatOpenAI below — confirm this line is actually consumed anywhere.
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'
# Section-aware PDF extractor
def extract_clean_sections(file_path):
    """Split a policy PDF into one Document per titled section.

    A section heading is any line of the form "Title:" where the title is
    capitalized and 4-51 characters long. Recurring letterhead boilerplate
    (campus address and e-mail lines) is stripped before splitting.
    Sections whose body is 20 characters or shorter are dropped.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            raw = page.extract_text()
            if not raw:
                continue
            # Remove letterhead noise so it cannot pollute section bodies.
            raw = re.sub(r'Systems Campus.*?Lahore', '', raw)
            raw = re.sub(r'E-mail:.*?systemsltd\.com', '', raw)
            page_texts.append(raw + "\n")
    full_text = "".join(page_texts)

    heading = r"(?<=\n)([A-Z][^\n]{3,50}):"
    # re.split with one capture group yields [preamble, title, body, title, body, ...]
    pieces = re.split(heading, full_text)
    sections = []
    for title, body in zip(pieces[1::2], pieces[2::2]):
        title = title.strip()
        body = body.strip()
        if len(body) > 20:
            sections.append(
                Document(page_content=f"{title}:\n{body}", metadata={"section": title})
            )
    return sections
# TF-IDF Embedding for RAG
class TfidfEmbedding(Embeddings):
    """LangChain-compatible embeddings backed by scikit-learn TF-IDF.

    Call fit() on the corpus first so documents and queries share one
    learned vocabulary; both embed methods then return dense vectors.
    """

    def __init__(self):
        # Vocabulary and IDF weights are learned later via fit().
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Learn the TF-IDF vocabulary and IDF weights from `texts`."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return one dense TF-IDF row vector per input text."""
        return self.vectorizer.transform(texts).toarray()

    def embed_query(self, text):
        """Return the dense TF-IDF vector for a single query string."""
        return self.vectorizer.transform([text]).toarray()[0]
# prompt
# Claim-checking prompt: forces a Yes/No/Partially verdict grounded in the
# retrieved policy text. {context} and {question} are filled in at query
# time by the RetrievalQA "stuff" chain.
TEMPLATE = """
You are a strict healthcare policy checker for Systems Ltd.
Always begin your answer clearly:
- Say "Yes, ..." if the claim is valid
- Say "No, ..." if the claim is not valid
- Say "Partially, ..." if it's conditionally allowed
Use the following policy information to support your answer.
{context}
Question: {question}
Answer:
"""
custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
# Global state
# Populated by load_policy() when the user clicks the load button;
# read by ask_policy_question(). None until the policy PDF is indexed.
retriever = None
qa_chain = None
# Process the PDF once when the load button is clicked.
# (The original comment marker was mojibake and its text had spilled onto a
# bare line, which made the module unparseable.)
def load_policy():
    """Build the retrieval QA chain from healthcare_policy.pdf.

    Side effects: sets the module globals `retriever` and `qa_chain`.
    Returns a human-readable status string for the Gradio status box.
    Expects healthcare_policy.pdf in the working directory.
    """
    global retriever, qa_chain
    docs = extract_clean_sections("healthcare_policy.pdf")
    texts = [doc.page_content for doc in docs]

    # Fit TF-IDF on the policy corpus, then index the dense vectors in FAISS.
    embedder = TfidfEmbedding()
    embedder.fit(texts)
    vectordb = FAISS.from_texts(texts, embedder)
    retriever = vectordb.as_retriever()

    # OpenRouter speaks the OpenAI wire protocol; the extra headers identify
    # this app to OpenRouter. temperature=0 keeps verdicts deterministic.
    llm = ChatOpenAI(
        model="tngtech/deepseek-r1t2-chimera:free",
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENAI_API_KEY"),
        default_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "PDF-RAG",
        },
        temperature=0.0,
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": custom_prompt},
    )
    return "Policy loaded. You may now ask questions."
# Answer a claim question against the loaded policy.
# (The original comment marker was mojibake and its text had spilled onto a
# bare line, which made the module unparseable.)
def ask_policy_question(question):
    """Run `question` through the RetrievalQA chain and return the answer.

    Returns a "load the policy first" hint if load_policy() has not run
    yet. This is a UI boundary, so any chain failure is rendered as an
    "Error: ..." string rather than raised into Gradio.
    """
    if qa_chain is None:
        return "Please click 'Ask about claim' to load the policy first."
    try:
        return qa_chain.run(question)
    except Exception as e:
        return f"Error: {str(e)}"
# Gradio UI
# (The original comment marker was mojibake and its text had spilled onto a
# bare line, which made the module unparseable.)
with gr.Blocks() as demo:
    gr.Markdown("## SL HealthCare Claim Checker (RAG)")
    # NOTE(review): the original button label began with mojibake ("π₯",
    # a corrupted emoji); replaced with plain text.
    load_btn = gr.Button("Ask about claim (Load Policy)")
    load_status = gr.Textbox(label="Status")
    load_btn.click(fn=load_policy, outputs=load_status)
    with gr.Row():
        question = gr.Textbox(label="Enter your claim question")
        ask_btn = gr.Button("Ask")
    answer = gr.Textbox(label="Answer", lines=6)
    ask_btn.click(fn=ask_policy_question, inputs=question, outputs=answer)

demo.launch()