import os
import re

import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer

from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

# ✅ Route OpenAI-compatible calls through OpenRouter
os.environ["OPENAI_API_KEY"] = os.getenv("OPENROUTER_API_KEY")
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co/spaces/saadawaissheikh/SystemsHealthcareChatbot", "X-Title":"PDF Chatbot"}'

# ✅ Load PDF once at startup
PDF_PATH = "healthcare_policy.pdf"

# ✅ Force path to Tesseract binary (required for Hugging Face Spaces)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


class TfidfEmbedding(Embeddings):
    """TF-IDF embedding wrapper so FAISS can be used without an external embedding API."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        return self.vectorizer.transform(texts).toarray()

    def embed_query(self, text):
        return self.vectorizer.transform([text]).toarray()[0]


def load_pdf_chunks(pdf_path):
    """Extract the PDF text and split it into overlapping chunks for retrieval."""
    with pdfplumber.open(pdf_path) as pdf:
        full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = splitter.split_text(full_text)
    return [Document(page_content=chunk) for chunk in chunks]


def setup_vectordb(docs):
    """Fit the TF-IDF embedder on the chunks and build a FAISS index over them."""
    texts = [doc.page_content for doc in docs]
    embedder = TfidfEmbedding()
    embedder.fit(texts)
    vectordb = FAISS.from_texts(texts, embedder)
    return vectordb


def get_llm():
    return ChatOpenAI(
        model="tngtech/deepseek-r1t2-chimera:free",
        temperature=0.0,
    )


def get_qa_chain():
    docs = load_pdf_chunks(PDF_PATH)
    vectordb = setup_vectordb(docs)
    retriever = vectordb.as_retriever()
    prompt = PromptTemplate.from_template(
        "Answer with Yes or No first. Then explain: {context}\nQuestion: {question}"
    )
    llm = get_llm()
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=False,
        chain_type_kwargs={"prompt": prompt},
    )


qa_chain = get_qa_chain()


# ✅ Standard PDF QA
def ask_question(query):
    try:
        return qa_chain.run(query)
    except Exception as e:
        return f"Error: {e}"


# ✅ Extract tablet names from OCR text
def extract_tablet_names(text):
    medicines = []
    for line in text.splitlines():
        match = re.search(r"\b([A-Za-z]+(?:\s+[A-Za-z]+)*)\s*(\d+mg|\d+\s*mg)?\b", line)
        if match:
            name = match.group(1).strip()
            if name.lower() not in ["cash", "scaling", "polish"]:
                medicines.append(name)
    return list(set(medicines))


def extract_text_from_image(img_path):
    image = Image.open(img_path)
    raw_text = pytesseract.image_to_string(image)
    return extract_tablet_names(raw_text)


def check_tablets(img_path):
    try:
        # Step 1: Confirm image path is received
        if not img_path or not os.path.exists(img_path):
            return "❌ Error: Image path is invalid or file not found."

        # Step 2: Run OCR
        image = Image.open(img_path)
        raw_text = pytesseract.image_to_string(image)

        # Step 3: Extract medicine names
        tablets = extract_tablet_names(raw_text)
        if not tablets:
            return "❌ No tablets found in the receipt text."

        # Step 4: Use RAG to check each tablet
        result = ""
        for med in tablets:
            question = f"Is the medicine {med} covered under the healthcare policy?"
            try:
                answer = qa_chain.run(question)
            except Exception as e:
                answer = f"RAG error: {str(e)}"
            result += f"💊 {med} → {answer}\n\n"

        return result.strip()

    except Exception as e:
        return f"❌ Critical error during tablet check: {str(e)}"


# ✅ Gradio UI
with gr.Blocks(title="Healthcare Chatbot") as app:
    gr.Markdown("# 💬 Systems Healthcare Chatbot")
    gr.Markdown("📄 Policy document loaded. You may now ask questions or upload a medicine receipt to check claims.")

    with gr.Tab("Ask about Policy"):
        with gr.Row():
            txt = gr.Textbox(label="Your Question")
            ans = gr.Textbox(label="Answer")
        txt.submit(fn=ask_question, inputs=txt, outputs=ans)

    with gr.Tab("Check Tablet Claim"):
        with gr.Row():
            img = gr.Image(type="filepath", label="Upload Tablet Receipt")
            out = gr.Textbox(label="Result")
        img.change(fn=check_tablets, inputs=img, outputs=out)

# ✅ Launch App
app.launch()
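
# Optional local smoke test (a minimal sketch, not part of the deployed Space; assumes the
# same dependencies are installed and healthcare_policy.pdf is present). It exercises the
# retrieval layer without launching the Gradio UI; uncomment and run in a Python shell:
#
#     docs = load_pdf_chunks(PDF_PATH)
#     vectordb = setup_vectordb(docs)
#     for doc in vectordb.similarity_search("dental coverage", k=2):
#         print(doc.page_content[:200])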