import os
import gradio as gr
import pdfplumber
import re
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS  # ✅ Fixed deprecation warning
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from transformers import pipeline

# Set OpenRouter API env vars (used by ChatOpenAI)
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENROUTER_API_KEY", "")  # default to "" so a missing secret doesn't raise TypeError
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'
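# The same OpenRouter attribution headers are also passed explicitly to ChatOpenAI
# via default_headers in initialize_policy() below.
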
# Global variables
qa_chain = None
translator_en2ur = None
translator_ur2en = None
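# These are populated by initialize_policy(), which the Gradio demo.load() hook runs
# at startup; until then ask_policy_question() returns a "still loading" message.
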
# -------------------- PDF Extraction --------------------
def extract_clean_sections(file_path):
    with pdfplumber.open(file_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Strip the repeated address/e-mail header and footer lines from each page
                text = re.sub(r'Systems Campus.*?Lahore', '', text)
                text = re.sub(r'E-mail:.*?systemsltd\.com', '', text)
                full_text += text + "\n"
    # Split on heading-like lines ("Some Title:") so each policy section becomes one Document
    pattern = r"(?<=\n)([A-Z][^\n]{3,50}):"
    parts = re.split(pattern, full_text)
    docs = []
    for i in range(1, len(parts), 2):
        title = parts[i].strip()
        content = parts[i + 1].strip()
        if len(content) > 20:
            docs.append(Document(page_content=f"{title}:\n{content}", metadata={"section": title}))
    return docs

# -------------------- TF-IDF Embedder --------------------
class TfidfEmbedding(Embeddings):
    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        return self.vectorizer.transform(texts).toarray()

    def embed_query(self, text):
        return self.vectorizer.transform([text]).toarray()[0]
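# Note: TF-IDF vectors are corpus-specific, so fit() must be called on the extracted
# policy texts before embed_documents()/embed_query() are used; transform() on an
# unfitted TfidfVectorizer raises NotFittedError. initialize_policy() below fits the
# embedder on the section texts before building the FAISS index.
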
# -------------------- Custom Prompt --------------------
TEMPLATE = """
You are a strict healthcare policy checker for Systems Ltd.
Always begin your answer clearly:
- Say "Yes, ..." if the claim is valid
- Say "No, ..." if the claim is not valid
- Say "Partially, ..." if it's conditionally allowed
Use the following policy information to support your answer.
{context}
Question: {question}
Answer:
"""
custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])

# -------------------- Policy Initialization --------------------
def initialize_policy():
    global qa_chain, translator_en2ur, translator_ur2en
    docs = extract_clean_sections("healthcare_policy.pdf")
    texts = [doc.page_content for doc in docs]
    embedder = TfidfEmbedding()
    embedder.fit(texts)
    vectordb = FAISS.from_texts(texts, embedder)
    retriever = vectordb.as_retriever()
    llm = ChatOpenAI(
        model="tngtech/deepseek-r1t2-chimera:free",
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENAI_API_KEY"),
        default_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "PDF-RAG"
        },
        temperature=0.0
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": custom_prompt}
    )
    # ✅ Load translation models
    translator_en2ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
    translator_ur2en = pipeline("translation", model="Helsinki-NLP/opus-mt-ur-en")
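# Note: both MarianMT translation models are downloaded from the Hugging Face Hub (and
# cached) the first time initialize_policy() runs, so the initial "Loading..." phase can
# take a while on a fresh Space.
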
# -------------------- QA with Bilingual Support --------------------
def ask_policy_question(question, language):
    if qa_chain is None:
        return "The policy is still loading. Please wait."
    try:
        if language == "Urdu":
            question_en = translator_ur2en(question)[0]['translation_text']
            answer_en = qa_chain.run(question_en)
            answer_ur = translator_en2ur(answer_en)[0]['translation_text']
            return answer_ur
        else:
            return qa_chain.run(question)
    except Exception as e:
        return f"Error: {str(e)}"

# -------------------- Gradio Interface --------------------
status_text = "Loading..."

with gr.Blocks() as demo:
    gr.Markdown("## 📋 SL HealthCare Claim Checker (Bilingual: English / اردو)")
    status_box = gr.Textbox(label="Status", value=status_text, interactive=False)
    with gr.Row():
        language = gr.Radio(choices=["English", "Urdu"], label="Select Language / زبان منتخب کریں", value="English")
        question = gr.Textbox(label="Enter your claim question / اپنا سوال درج کریں")
    ask_btn = gr.Button("Ask / پوچھیں")
    answer = gr.Textbox(label="Answer / جواب", lines=6)
    ask_btn.click(fn=ask_policy_question, inputs=[question, language], outputs=answer)

    def startup():
        global status_text
        initialize_policy()
        status_text = "Policy loaded. You may now ask questions."
        return status_text

    demo.load(fn=startup, outputs=status_box)

demo.launch()