File size: 5,009 Bytes
2ca0835
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b633f2
2ca0835
7ce1fea
 
1553e72
7ce1fea
 
1553e72
2ca0835
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9fc52e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ca0835
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b3e781
 
 
2ca0835
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157


import os
import re
import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter


# βœ… Route OpenAI-compatible clients through OpenRouter.
# os.getenv() returns None when the variable is unset, and assigning None
# into os.environ raises TypeError at import time — default to "" so the
# app starts and the API rejects the missing key later with a clear error.
os.environ["OPENAI_API_KEY"] = os.getenv("OPENROUTER_API_KEY") or ""
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co/spaces/saadawaissheikh/SystemsHealthcareChatbot", "X-Title":"PDF Chatbot"}'

# βœ… Load PDF once at startup
PDF_PATH = "healthcare_policy.pdf"

# βœ… Force path to Tesseract binary (required for Hugging Face Spaces)
# (duplicate `import pytesseract` removed — it is already imported at the top)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


class TfidfEmbedding(Embeddings):
    """TF-IDF embedding adapter implementing the LangChain `Embeddings` interface.

    `fit()` must be called on the corpus before `embed_documents` /
    `embed_query`; otherwise scikit-learn raises NotFittedError.
    """

    def __init__(self):
        # Vectorizer is unfitted until fit() is called.
        self.vectorizer = TfidfVectorizer()

    def fit(self, texts):
        """Learn the TF-IDF vocabulary/IDF weights from the given corpus."""
        self.vectorizer.fit(texts)

    def embed_documents(self, texts):
        """Return one dense vector per document.

        Converted with .tolist() because the Embeddings contract expects
        List[List[float]], not a numpy ndarray.
        """
        return self.vectorizer.transform(texts).toarray().tolist()

    def embed_query(self, text):
        """Return a single dense vector (List[float]) for a query string."""
        return self.vectorizer.transform([text]).toarray()[0].tolist()

def load_pdf_chunks(pdf_path):
    """Read a PDF, concatenate all page text, and split it into
    overlapping chunks wrapped as LangChain Documents."""
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages
            page_texts.append(page.extract_text() or "")
    full_text = "\n".join(page_texts)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    return [Document(page_content=piece) for piece in splitter.split_text(full_text)]

def setup_vectordb(docs):
    """Fit a TF-IDF embedder on the chunk texts and index them in FAISS."""
    corpus = [doc.page_content for doc in docs]
    embedding = TfidfEmbedding()
    embedding.fit(corpus)
    return FAISS.from_texts(corpus, embedding)

def get_llm():
    """Build the chat model: a free DeepSeek variant served via OpenRouter,
    with temperature 0 for deterministic answers."""
    model_name = "tngtech/deepseek-r1t2-chimera:free"
    return ChatOpenAI(model=model_name, temperature=0.0)

def get_qa_chain():
    """Wire the full RAG pipeline: PDF chunks -> TF-IDF/FAISS retriever ->
    "stuff" RetrievalQA chain with a Yes/No-first answer prompt."""
    vectordb = setup_vectordb(load_pdf_chunks(PDF_PATH))
    template = "Answer with Yes or No first. Then explain: {context}\nQuestion: {question}"
    return RetrievalQA.from_chain_type(
        llm=get_llm(),
        retriever=vectordb.as_retriever(),
        chain_type="stuff",
        return_source_documents=False,
        chain_type_kwargs={"prompt": PromptTemplate.from_template(template)},
    )

# Built once at import time (loads + indexes the PDF); module-level so both
# Gradio handlers below share the same chain.
qa_chain = get_qa_chain()

# βœ… Standard PDF QA
def ask_question(query):
    """Run a free-text question through the RAG chain.

    Any failure is surfaced as a readable string so the Gradio textbox
    always shows something instead of the UI erroring out.
    """
    try:
        answer = qa_chain.run(query)
    except Exception as e:
        return f"Error: {e}"
    return answer

# βœ… Extract Tablets from Image
def extract_tablet_names(text):
    """Pull candidate medicine names out of OCR'd receipt text.

    Each line is scanned for a run of alphabetic words, optionally followed
    by a dosage ("500mg" / "500 mg"); a small blocklist drops obvious
    non-medicine receipt words.

    Returns:
        Unique names in first-seen order. (The old list(set(...)) gave a
        non-deterministic order under hash randomization.)
    """
    # Raw string keeps the regex escapes readable; the doubled backslashes
    # in the previous version produced the identical pattern.
    pattern = re.compile(r"\b([A-Za-z]+(?:\s+[A-Za-z]+)*)\s*(\d+mg|\d+\s*mg)?\b")
    blocklist = {"cash", "scaling", "polish"}
    medicines = []
    for line in text.splitlines():
        match = pattern.search(line)
        if match:
            name = match.group(1).strip()
            if name.lower() not in blocklist:
                medicines.append(name)
    # dict.fromkeys dedupes while preserving first-seen order.
    return list(dict.fromkeys(medicines))

def extract_text_from_image(img_path):
    """OCR an image file and return the extracted tablet names.

    NOTE(review): despite the name, this returns the parsed medicine list
    (not the raw OCR text) — kept as-is for backward compatibility.
    """
    # Context manager releases the file handle PIL keeps open (the old
    # version leaked it).
    with Image.open(img_path) as image:
        raw_text = pytesseract.image_to_string(image)
    return extract_tablet_names(raw_text)

def check_tablets(img_path):
    """OCR a receipt image, extract medicine names, and ask the RAG chain
    whether each one is covered by the policy.

    Always returns a user-facing string (never raises) so Gradio can
    display the outcome in the result textbox.
    """
    try:
        # Step 1: Confirm image path is received
        if not img_path or not os.path.exists(img_path):
            return "❌ Error: Image path is invalid or file not found."

        # Step 2: Run OCR (context manager closes the file handle the old
        # version leaked)
        with Image.open(img_path) as image:
            raw_text = pytesseract.image_to_string(image)

        # Step 3: Extract medicine names
        tablets = extract_tablet_names(raw_text)
        if not tablets:
            return "❌ No tablets found in the receipt text."

        # Step 4: Use RAG to check each tablet. Collect lines and join once
        # instead of quadratic `result += ...` concatenation; per-medicine
        # errors are reported inline so one failure doesn't abort the rest.
        lines = []
        for med in tablets:
            question = f"Is the medicine {med} covered under the healthcare policy?"
            try:
                answer = qa_chain.run(question)
            except Exception as e:
                answer = f"RAG error: {str(e)}"
            lines.append(f"πŸ’Š {med} β†’ {answer}\n\n")

        return "".join(lines).strip()

    except Exception as e:
        return f"❌ Critical error during tablet check: {str(e)}"


# βœ… Gradio UI — two tabs sharing the module-level qa_chain.
with gr.Blocks(title="Healthcare Chatbot") as app:
    gr.Markdown("# πŸ’¬ Systems Healthcare Chatbot")
    gr.Markdown("πŸ“„ Policy document loaded. You may now ask questions or upload a medicine receipt to check claims.")

    # Tab 1: free-text Q&A over the policy PDF; pressing Enter in the
    # question box triggers ask_question.
    with gr.Tab("Ask about Policy"):
        with gr.Row():
            txt = gr.Textbox(label="Your Question")
            ans = gr.Textbox(label="Answer")
        txt.submit(fn=ask_question, inputs=txt, outputs=ans)

    # Tab 2: receipt upload; type="filepath" hands check_tablets a path on
    # disk, and .change fires automatically when an image is uploaded.
    with gr.Tab("Check Tablet Claim"):
        with gr.Row():
            img = gr.Image(type="filepath", label="Upload Tablet Receipt")
            out = gr.Textbox(label="Result")
        img.change(fn=check_tablets, inputs=img, outputs=out)




# βœ… Launch App
# NOTE(review): no share/server arguments — relies on the hosting
# platform's defaults (e.g. Hugging Face Spaces).
app.launch()