# Hugging Face Spaces page header captured with this file -- Spaces: Sleeping
import os | |
import re | |
import gradio as gr | |
import pdfplumber | |
import pytesseract | |
from PIL import Image | |
from langchain.docstore.document import Document | |
from langchain.vectorstores import FAISS | |
from langchain.embeddings.base import Embeddings | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from langchain.chains import RetrievalQA | |
from langchain.prompts import PromptTemplate | |
from langchain_openai import ChatOpenAI | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Point the OpenAI-compatible client at OpenRouter. The key is read from
# OPENROUTER_API_KEY; os.environ only accepts strings, so a missing variable
# would otherwise fail with an opaque "str expected, not NoneType" TypeError.
_openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if _openrouter_api_key is None:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before starting the app."
    )
os.environ["OPENAI_API_KEY"] = _openrouter_api_key
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
# NOTE(review): confirm that the client library actually reads
# OPENAI_API_HEADERS; nothing in this file consumes it directly.
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co/spaces/saadawaissheikh/SystemsHealthcareChatbot", "X-Title":"PDF Chatbot"}'

# Policy PDF that is loaded once at startup.
PDF_PATH = "healthcare_policy.pdf"

# Duplicate of the import at the top of the file; kept in place on purpose.
import pytesseract

# Force the path to the Tesseract binary (required for Hugging Face Spaces).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
class TfidfEmbedding(Embeddings):
    """Embeddings adapter that delegates to a scikit-learn ``TfidfVectorizer``.

    ``fit`` is expected to be called with the corpus first: both embed
    methods only call ``transform`` on the wrapped vectorizer.
    """

    def __init__(self):
        # Default-configured vectorizer; it is fitted later through fit().
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Fit the wrapped vectorizer on *texts*."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return the densified TF-IDF matrix for *texts* (one row per text)."""
        tfidf_matrix = self.vectorizer.transform(texts)
        return tfidf_matrix.toarray()

    def embed_query(self, text):
        """Return the densified TF-IDF row for the single string *text*."""
        single_row_matrix = self.vectorizer.transform([text]).toarray()
        return single_row_matrix[0]
def load_pdf_chunks(pdf_path):
    """Read the PDF at *pdf_path* and return its text as ``Document`` chunks.

    The text of every page is joined with newlines (a page whose
    ``extract_text()`` result is falsy contributes an empty string), then
    split with ``chunk_size=1000`` and ``chunk_overlap=50``.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    full_text = "\n".join(page_texts)

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    documents = []
    for piece in splitter.split_text(full_text):
        documents.append(Document(page_content=piece))
    return documents
def setup_vectordb(docs):
    """Build a FAISS vector store from the page content of *docs*.

    A fresh ``TfidfEmbedding`` is fitted on the same texts that are then
    indexed, and the resulting store is returned.
    """
    corpus = []
    for doc in docs:
        corpus.append(doc.page_content)

    tfidf = TfidfEmbedding()
    tfidf.fit(corpus)
    return FAISS.from_texts(corpus, tfidf)
def get_llm():
    """Create the ``ChatOpenAI`` model used to answer questions."""
    llm_options = {
        "model": "tngtech/deepseek-r1t2-chimera:free",
        "temperature": 0.0,
    }
    return ChatOpenAI(**llm_options)
def get_qa_chain():
    """Assemble the ``RetrievalQA`` chain over the policy PDF.

    Loads and chunks ``PDF_PATH``, indexes the chunks, and wires the
    resulting retriever, the prompt and the LLM into a "stuff" chain that
    does not return source documents.
    """
    vectordb = setup_vectordb(load_pdf_chunks(PDF_PATH))
    retriever = vectordb.as_retriever()
    prompt = PromptTemplate.from_template(
        "Answer with Yes or No first. Then explain: {context}\nQuestion: {question}"
    )
    chain_options = {
        "llm": get_llm(),
        "retriever": retriever,
        "chain_type": "stuff",
        "return_source_documents": False,
        "chain_type_kwargs": {"prompt": prompt},
    }
    return RetrievalQA.from_chain_type(**chain_options)
# Built once at import time and shared by every request handler below.
qa_chain = get_qa_chain()


# Standard PDF QA
def ask_question(query):
    """Answer *query* with the shared QA chain.

    Any exception raised by the chain is reported as an ``"Error: ..."``
    string instead of propagating to the caller.
    """
    try:
        answer = qa_chain.run(query)
    except Exception as exc:
        return f"Error: {exc}"
    return answer
# Extract tablet names from OCR'd receipt text.
# First run of alphabetic words on a line, optionally followed by a dosage
# such as "500mg" or "500 mg". Single backslashes are required here: in a raw
# string "\\b" is a literal backslash followed by "b", which never occurs in
# OCR output, so the doubled form could not match anything.
_TABLET_LINE_RE = re.compile(
    r"\b([A-Za-z]+(?:\s+[A-Za-z]+)*)\s*(\d+mg|\d+\s*mg)?\b"
)
# Lower-cased names that are dropped from the result
# (presumably non-medicine receipt terms -- verify against real receipts).
_EXCLUDED_NAMES = frozenset({"cash", "scaling", "polish"})


def extract_tablet_names(text):
    """Return the distinct candidate medicine names found in *text*.

    Each line contributes at most one name: the first run of alphabetic
    words matched by ``_TABLET_LINE_RE``, stripped of surrounding
    whitespace. Names whose lower-cased form is in ``_EXCLUDED_NAMES`` are
    skipped.

    Args:
        text: Multi-line text, e.g. OCR output. ``None`` or an empty string
            yields an empty list.

    Returns:
        A list of unique names in order of first appearance.
    """
    if not text:
        return []

    # A dict is used as an ordered set so the result order is deterministic.
    names = {}
    for line in text.splitlines():
        match = _TABLET_LINE_RE.search(line)
        if match is None:
            continue
        name = match.group(1).strip()
        if name.lower() not in _EXCLUDED_NAMES:
            names.setdefault(name, None)
    return list(names)
def extract_text_from_image(img_path):
    """Run OCR on the image at *img_path* and return the tablet names found.

    Despite the name, the return value is not the raw OCR text: the text is
    passed through ``extract_tablet_names`` and that result is returned.

    Args:
        img_path: Path of the image file to read.

    Returns:
        Whatever ``extract_tablet_names`` returns for the OCR text.
    """
    # The context manager closes the image's file handle once OCR is done,
    # instead of leaving it open until garbage collection.
    with Image.open(img_path) as image:
        raw_text = pytesseract.image_to_string(image)
    return extract_tablet_names(raw_text)
def check_tablets(img_path):
    """OCR a receipt image and ask the QA chain about every tablet found.

    Args:
        img_path: Path of the uploaded image; may be ``None`` or empty.

    Returns:
        A single string. It is an error message when the path is missing or
        not on disk, a notice when no tablet names were extracted, and
        otherwise one ``name / answer`` entry per tablet, separated by blank
        lines. A failure of the QA chain for one tablet is reported inline
        for that tablet; any other exception is reported as a critical error
        string instead of propagating.
    """
    try:
        # Step 1: Confirm image path is received
        if not img_path or not os.path.exists(img_path):
            return "β Error: Image path is invalid or file not found."

        # Step 2: Run OCR (the context manager releases the file handle)
        with Image.open(img_path) as image:
            raw_text = pytesseract.image_to_string(image)

        # Step 3: Extract medicine names
        tablets = extract_tablet_names(raw_text)
        if not tablets:
            return "β No tablets found in the receipt text."

        # Step 4: Use RAG to check each tablet
        entries = []
        for med in tablets:
            question = f"Is the medicine {med} covered under the healthcare policy?"
            try:
                answer = qa_chain.run(question)
            except Exception as e:
                # Best effort: one failing lookup must not hide the others.
                answer = f"RAG error: {str(e)}"
            entries.append(f"π {med} β {answer}")
        # Entries are joined with a blank line, then stripped as before.
        return "\n\n".join(entries).strip()
    except Exception as e:
        return f"β Critical error during tablet check: {str(e)}"
# Gradio UI: two tabs, one per handler defined above.
with gr.Blocks(title="Healthcare Chatbot") as app:
    gr.Markdown("# π¬ Systems Healthcare Chatbot")
    gr.Markdown("π Policy document loaded. You may now ask questions or upload a medicine receipt to check claims.")

    # Free-text questions; submitting the textbox calls ask_question.
    with gr.Tab("Ask about Policy"):
        with gr.Row():
            question_box = gr.Textbox(label="Your Question")
            answer_box = gr.Textbox(label="Answer")
        question_box.submit(ask_question, question_box, answer_box)

    # Receipt upload; any change of the image calls check_tablets with its path.
    with gr.Tab("Check Tablet Claim"):
        with gr.Row():
            receipt_image = gr.Image(type="filepath", label="Upload Tablet Receipt")
            claim_result = gr.Textbox(label="Result")
        receipt_image.change(check_tablets, receipt_image, claim_result)

# Launch App
app.launch()