Spaces:
Sleeping
Sleeping
File size: 5,009 Bytes
2ca0835 2b633f2 2ca0835 7ce1fea 1553e72 7ce1fea 1553e72 2ca0835 f9fc52e 2ca0835 6b3e781 2ca0835 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import os
import re
import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Point the OpenAI-compatible client at OpenRouter through environment variables.
# os.environ only accepts str values, so assigning the result of os.getenv()
# directly raises TypeError at import time when OPENROUTER_API_KEY is unset.
# The key is therefore copied only when it is present; otherwise OPENAI_API_KEY
# is left untouched and the client library is left to report missing credentials.
_openrouter_key = os.getenv("OPENROUTER_API_KEY")
if _openrouter_key:
    os.environ["OPENAI_API_KEY"] = _openrouter_key
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
# NOTE(review): confirm that the client in use actually reads OPENAI_API_HEADERS;
# nothing in this file consumes it directly.
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co/spaces/saadawaissheikh/SystemsHealthcareChatbot", "X-Title":"PDF Chatbot"}'
# ✅ Load PDF once at startup
# Policy document that get_qa_chain() loads and indexes. The path is relative,
# so it is resolved against the process's current working directory.
PDF_PATH = "healthcare_policy.pdf"
import pytesseract  # NOTE(review): repeats the `import pytesseract` near the top of the file; harmless.
# ✅ Force path to Tesseract binary (required for Hugging Face Spaces)
# pytesseract runs the Tesseract executable named here; an absolute path is
# given instead of relying on a PATH lookup.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
class TfidfEmbedding(Embeddings):
    """TF-IDF based implementation of the LangChain ``Embeddings`` interface.

    Vectors come from a scikit-learn ``TfidfVectorizer``. ``fit`` must be
    called with the corpus before ``embed_documents`` or ``embed_query``,
    because both only call ``transform`` on the already-fitted vectorizer.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Learn the vocabulary and IDF weights from ``texts``."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return one dense TF-IDF vector per text, as a list of lists of floats."""
        # The Embeddings interface declares plain Python lists, so the dense
        # numpy matrix is converted rather than returned as an ndarray.
        return self.vectorizer.transform(texts).toarray().tolist()

    def embed_query(self, text):
        """Return the dense TF-IDF vector for a single query, as a list of floats."""
        return self.vectorizer.transform([text]).toarray()[0].tolist()
def load_pdf_chunks(pdf_path):
    """Read a PDF and return its text as a list of ``Document`` chunks.

    The text of every page is joined with newlines (pages for which
    ``extract_text()`` yields nothing contribute an empty string), then split
    with ``RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)``.
    Each resulting piece becomes one ``Document``.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    full_text = "\n".join(page_texts)

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    documents = []
    for piece in splitter.split_text(full_text):
        documents.append(Document(page_content=piece))
    return documents
def setup_vectordb(docs):
    """Build a FAISS store over the text of ``docs``.

    A fresh ``TfidfEmbedding`` is fitted on the same texts it will then embed,
    and is handed to ``FAISS.from_texts`` as the embedding object.
    """
    corpus = [document.page_content for document in docs]
    tfidf = TfidfEmbedding()
    tfidf.fit(corpus)
    return FAISS.from_texts(corpus, tfidf)
def get_llm():
    """Create the chat model client used by the QA chain.

    The model name and a temperature of 0.0 are fixed here; credentials and
    the API base are not passed explicitly.
    """
    llm_settings = {
        "model": "tngtech/deepseek-r1t2-chimera:free",
        "temperature": 0.0,
    }
    return ChatOpenAI(**llm_settings)
def get_qa_chain():
    """Assemble the retrieval QA chain for the policy PDF.

    Loads and chunks ``PDF_PATH``, indexes the chunks, and wires the resulting
    retriever and the chat model into a "stuff" ``RetrievalQA`` chain whose
    prompt asks for a Yes/No answer followed by an explanation. Source
    documents are not returned with the answer.
    """
    retriever = setup_vectordb(load_pdf_chunks(PDF_PATH)).as_retriever()
    yes_no_prompt = PromptTemplate.from_template(
        "Answer with Yes or No first. Then explain: {context}\nQuestion: {question}"
    )
    chain_options = {
        "llm": get_llm(),
        "retriever": retriever,
        "chain_type": "stuff",
        "return_source_documents": False,
        "chain_type_kwargs": {"prompt": yes_no_prompt},
    }
    return RetrievalQA.from_chain_type(**chain_options)
# Built once at import time; ask_question() and check_tablets() both reuse this
# chain, so the PDF is parsed and indexed a single time per process.
qa_chain = get_qa_chain()
# ✅ Standard PDF QA
def ask_question(query):
    """Answer a policy question with the module-level QA chain.

    Returns the chain's answer, or the text ``"Error: <exception>"`` if
    running the chain raises, so the UI always receives a string to display.
    """
    try:
        answer = qa_chain.run(query)
    except Exception as err:
        return f"Error: {err}"
    return answer
# ✅ Extract Tablets from Image
# One candidate name per receipt line: a run of alphabetic words, optionally
# followed by a dosage such as "500mg" or "500 mg". The pattern is a raw string,
# so each regex escape takes a single backslash; doubling them makes the pattern
# look for literal backslashes, which never occur in receipt text.
_TABLET_LINE_RE = re.compile(r"\b([A-Za-z]+(?:\s+[A-Za-z]+)*)\s*(\d+mg|\d+\s*mg)?\b")

# Receipt words that are not medicines (compared case-insensitively).
_NON_MEDICINE_WORDS = frozenset({"cash", "scaling", "polish"})


def extract_tablet_names(text):
    """Extract candidate medicine names from OCR'd receipt text.

    Each line contributes at most one name: the first run of alphabetic words
    on that line, with any dosage suffix left out. This is a loose heuristic;
    any line containing letters yields a candidate unless the whole name is
    one of the excluded receipt words.

    Args:
        text: Receipt text, one item per line. ``None`` or an empty string
            yields an empty list.

    Returns:
        Unique names in order of first appearance.
    """
    # A dict is used as an ordered set so the result order is deterministic.
    medicines = {}
    for line in (text or "").splitlines():
        match = _TABLET_LINE_RE.search(line)
        if match is None:
            continue
        name = match.group(1).strip()
        if name.lower() not in _NON_MEDICINE_WORDS:
            medicines.setdefault(name, None)
    return list(medicines)
def extract_text_from_image(img_path):
    """OCR an image file and return the candidate medicine names found in it.

    Despite the name, the return value is the list produced by
    ``extract_tablet_names`` from the OCR text, not the raw text.

    Args:
        img_path: Path of the image file to read.

    Returns:
        List of candidate medicine names.
    """
    # The context manager closes the underlying file once OCR is done instead
    # of leaving the handle open until garbage collection.
    with Image.open(img_path) as image:
        raw_text = pytesseract.image_to_string(image)
    return extract_tablet_names(raw_text)
def check_tablets(img_path):
    """Check each medicine found on a receipt image against the policy.

    The image is OCR'd, candidate medicine names are extracted, and the QA
    chain is asked one coverage question per name.

    Args:
        img_path: Path of the uploaded receipt image; may be empty or ``None``.

    Returns:
        One "name / answer" entry per medicine, separated by blank lines, or a
        single message when the path is invalid, no names are found, or an
        unexpected error occurs. This function always returns a string and
        never raises, so the UI always has something to display.
    """
    # NOTE(review): the leading glyphs in the messages below look like
    # mis-decoded emoji; they are kept as-is because the intended characters
    # cannot be determined from this file.
    try:
        # Step 1: Confirm image path is received
        if not img_path or not os.path.exists(img_path):
            return "β Error: Image path is invalid or file not found."

        # Step 2: Run OCR (the context manager releases the file handle afterwards)
        with Image.open(img_path) as image:
            raw_text = pytesseract.image_to_string(image)

        # Step 3: Extract medicine names
        tablets = extract_tablet_names(raw_text)
        if not tablets:
            return "β No tablets found in the receipt text."

        # Step 4: Use RAG to check each tablet
        entries = []
        for med in tablets:
            question = f"Is the medicine {med} covered under the healthcare policy?"
            try:
                answer = qa_chain.run(question)
            except Exception as e:
                # A failure for one medicine is reported inline so the
                # remaining medicines are still checked.
                answer = f"RAG error: {str(e)}"
            entries.append(f"π {med} β {answer}")
        return "\n\n".join(entries).strip()
    except Exception as e:
        return f"β Critical error during tablet check: {str(e)}"
# ✅ Gradio UI
# Two-tab interface: free-text policy questions, and receipt-image coverage checks.
# NOTE(review): the nesting below follows the conventional layout for these
# calls (inputs/outputs inside a Row, event wiring inside the Tab); confirm it
# matches the deployed app.
with gr.Blocks(title="Healthcare Chatbot") as app:
    gr.Markdown("# π¬ Systems Healthcare Chatbot")
    gr.Markdown("π Policy document loaded. You may now ask questions or upload a medicine receipt to check claims.")
    with gr.Tab("Ask about Policy"):
        with gr.Row():
            txt = gr.Textbox(label="Your Question")
            ans = gr.Textbox(label="Answer")
        # The textbox's submit event sends the question to ask_question and
        # shows the returned string in the answer box.
        txt.submit(fn=ask_question, inputs=txt, outputs=ans)
    with gr.Tab("Check Tablet Claim"):
        with gr.Row():
            # type="filepath" hands check_tablets a path rather than pixel data,
            # which is what its os.path.exists() check expects.
            img = gr.Image(type="filepath", label="Upload Tablet Receipt")
            out = gr.Textbox(label="Result")
        # Runs on every change of the image value; an empty value is handled by
        # check_tablets' own path validation.
        img.change(fn=check_tablets, inputs=img, outputs=out)
# ✅ Launch App
# Start serving the Blocks app defined above, with Gradio's default launch settings.
app.launch()
|