saadawaissheikh committed on
Commit
f96d7b6
·
verified ·
1 Parent(s): 9dc8768

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -43
app.py CHANGED
@@ -2,37 +2,28 @@ import os
2
  import gradio as gr
3
  import pdfplumber
4
  import re
 
5
  from langchain.docstore.document import Document
6
- from langchain.vectorstores import FAISS
7
  from langchain.embeddings.base import Embeddings
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from langchain.chains import RetrievalQA
10
  from langchain.prompts import PromptTemplate
11
  from langchain_openai import ChatOpenAI
12
- from transformers import MarianMTModel, MarianTokenizer
 
13
 
14
- # Environment Variables
15
- os.environ["OPENAI_API_KEY"] = os.environ["OPENROUTER_API_KEY"]
16
  os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
17
  os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'
18
 
19
- # Urdu-English Translation
20
- en_to_ur_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
21
- en_to_ur_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
22
- ur_to_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ur-en")
23
- ur_to_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ur-en")
24
-
25
- def translate(text, src):
26
- if src == "ur":
27
- tokenizer, model = ur_to_en_tokenizer, ur_to_en_model
28
- else:
29
- tokenizer, model = en_to_ur_tokenizer, en_to_ur_model
30
-
31
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
32
- translated = model.generate(**inputs)
33
- return tokenizer.decode(translated[0], skip_special_tokens=True)
34
 
35
- # PDF Cleaner
36
  def extract_clean_sections(file_path):
37
  with pdfplumber.open(file_path) as pdf:
38
  full_text = ""
@@ -42,6 +33,7 @@ def extract_clean_sections(file_path):
42
  text = re.sub(r'Systems Campus.*?Lahore', '', text)
43
  text = re.sub(r'E-mail:.*?systemsltd\.com', '', text)
44
  full_text += text + "\n"
 
45
  pattern = r"(?<=\n)([A-Z][^\n]{3,50}):"
46
  parts = re.split(pattern, full_text)
47
 
@@ -53,18 +45,21 @@ def extract_clean_sections(file_path):
53
  docs.append(Document(page_content=f"{title}:\n{content}", metadata={"section": title}))
54
  return docs
55
 
56
- # TF-IDF Embedding
57
  class TfidfEmbedding(Embeddings):
58
  def __init__(self):
59
  self.vectorizer = TfidfVectorizer()
 
60
  def fit(self, texts):
61
  self.vectorizer.fit(texts)
 
62
  def embed_documents(self, texts):
63
  return self.vectorizer.transform(texts).toarray()
 
64
  def embed_query(self, text):
65
  return self.vectorizer.transform([text]).toarray()[0]
66
 
67
- # Custom Prompt
68
  TEMPLATE = """
69
  You are a strict healthcare policy checker for Systems Ltd.
70
  Always begin your answer clearly:
@@ -78,14 +73,16 @@ Answer:
78
  """
79
  custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
80
 
81
- # Load and Vectorize Policy
82
- qa_chain = None
83
  def initialize_policy():
84
- global qa_chain
 
85
  docs = extract_clean_sections("healthcare_policy.pdf")
86
  texts = [doc.page_content for doc in docs]
 
87
  embedder = TfidfEmbedding()
88
  embedder.fit(texts)
 
89
  vectordb = FAISS.from_texts(texts, embedder)
90
  retriever = vectordb.as_retriever()
91
 
@@ -108,38 +105,46 @@ def initialize_policy():
108
  chain_type_kwargs={"prompt": custom_prompt}
109
  )
110
 
111
- # QA Handler
 
 
 
 
 
112
  def ask_policy_question(question, language):
113
  if qa_chain is None:
114
- return "⏳ پالیسی لوڈ ہو رہی ہے، براہ کرم انتظار کریں..." if language == "Urdu" else "The policy is still loading. Please wait."
115
  try:
116
- # Translate question if in Urdu
117
- if language == "Urdu":
118
- question = translate(question, src="ur")
119
- answer = qa_chain.run(question)
120
  if language == "Urdu":
121
- answer = translate(answer, src="en")
122
- return answer
 
 
 
 
123
  except Exception as e:
124
- return f"Error: {str(e)}"
 
 
 
125
 
126
- # UI
127
- status_text = "⏳ پالیسی لوڈ ہو رہی ہے..."
128
  with gr.Blocks() as demo:
129
- gr.Markdown("## 🏥 Systems Ltd HealthCare Claim Checker (Bilingual RAG)")
 
130
  status_box = gr.Textbox(label="Status", value=status_text, interactive=False)
131
 
132
- lang = gr.Radio(choices=["English", "Urdu"], label="Select Language", value="English")
133
- question = gr.Textbox(label="Enter your healthcare claim question")
134
- ask_btn = gr.Button("Ask")
135
- answer = gr.Textbox(label="Answer", lines=6)
136
 
137
- ask_btn.click(fn=ask_policy_question, inputs=[question, lang], outputs=answer)
 
138
 
139
  def startup():
140
  global status_text
141
  initialize_policy()
142
- status_text = "Policy loaded. You may now ask questions."
143
  return status_text
144
 
145
  demo.load(fn=startup, outputs=status_box)
 
2
  import gradio as gr
3
  import pdfplumber
4
  import re
5
+
6
  from langchain.docstore.document import Document
7
+ from langchain_community.vectorstores import FAISS # ✅ Fixed deprecation warning
8
  from langchain.embeddings.base import Embeddings
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from langchain.chains import RetrievalQA
11
  from langchain.prompts import PromptTemplate
12
  from langchain_openai import ChatOpenAI
13
+ from transformers import pipeline
14
+
15
 
16
# Set OpenRouter API env vars (consumed by ChatOpenAI).
# NOTE: os.environ values must be strings — the previous
# os.environ.get("OPENROUTER_API_KEY") returned None when the secret was
# missing and assigning None raises TypeError at import time. Fall back to
# "" so the app still starts and auth fails later with a clear API error.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENROUTER_API_KEY", "")
os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'

# Global state, populated by initialize_policy() at app startup.
qa_chain = None          # RetrievalQA chain over the policy PDF
translator_en2ur = None  # translation pipeline: English -> Urdu
translator_ur2en = None  # translation pipeline: Urdu -> English
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ # -------------------- PDF Extraction --------------------
27
  def extract_clean_sections(file_path):
28
  with pdfplumber.open(file_path) as pdf:
29
  full_text = ""
 
33
  text = re.sub(r'Systems Campus.*?Lahore', '', text)
34
  text = re.sub(r'E-mail:.*?systemsltd\.com', '', text)
35
  full_text += text + "\n"
36
+
37
  pattern = r"(?<=\n)([A-Z][^\n]{3,50}):"
38
  parts = re.split(pattern, full_text)
39
 
 
45
  docs.append(Document(page_content=f"{title}:\n{content}", metadata={"section": title}))
46
  return docs
47
 
48
# -------------------- TF-IDF Embedder --------------------
class TfidfEmbedding(Embeddings):
    """LangChain-compatible embeddings backed by a scikit-learn TF-IDF vectorizer.

    Unlike pretrained neural embedders, TF-IDF has no fixed vocabulary:
    fit() must be called once on the document corpus before any
    embed_documents()/embed_query() call.
    """

    def __init__(self):
        # Vocabulary and IDF weights are learned later via fit().
        self.vectorizer = TfidfVectorizer()

    def fit(self, corpus):
        """Learn vocabulary and IDF statistics from the given corpus."""
        self.vectorizer.fit(corpus)

    def embed_documents(self, texts):
        """Return one dense TF-IDF vector (list-like of floats) per text."""
        return self.vectorizer.transform(texts).toarray()

    def embed_query(self, text):
        """Return the dense TF-IDF vector for a single query string."""
        # A query is just a one-document corpus; reuse the batch path.
        return self.embed_documents([text])[0]
61
 
62
+ # -------------------- Custom Prompt --------------------
63
  TEMPLATE = """
64
  You are a strict healthcare policy checker for Systems Ltd.
65
  Always begin your answer clearly:
 
73
  """
74
  custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
75
 
76
+ # -------------------- Policy Initialization --------------------
 
77
  def initialize_policy():
78
+ global qa_chain, translator_en2ur, translator_ur2en
79
+
80
  docs = extract_clean_sections("healthcare_policy.pdf")
81
  texts = [doc.page_content for doc in docs]
82
+
83
  embedder = TfidfEmbedding()
84
  embedder.fit(texts)
85
+
86
  vectordb = FAISS.from_texts(texts, embedder)
87
  retriever = vectordb.as_retriever()
88
 
 
105
  chain_type_kwargs={"prompt": custom_prompt}
106
  )
107
 
108
+ # Load translation models
109
+ translator_en2ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
110
+ translator_ur2en = pipeline("translation", model="Helsinki-NLP/opus-mt-ur-en")
111
+
112
+
113
# -------------------- QA with Bilingual Support --------------------
def ask_policy_question(question, language):
    """Answer a healthcare-policy question, translating Urdu <-> English as needed.

    Args:
        question: the user's question text (English or Urdu).
        language: "English" or "Urdu", from the UI language selector.

    Returns:
        The QA chain's answer (translated back to Urdu when requested),
        or a human-readable loading/error message.
    """
    if qa_chain is None:
        return "The policy is still loading. Please wait."
    # Guard: initialize_policy() assigns qa_chain BEFORE the translation
    # pipelines finish downloading, so an early Urdu request could hit a
    # None pipeline and surface a confusing "'NoneType' object is not
    # callable" error instead of a loading message.
    if language == "Urdu" and (translator_ur2en is None or translator_en2ur is None):
        return "The policy is still loading. Please wait."
    try:
        if language == "Urdu":
            # Retrieval corpus is English: translate question in, answer out.
            question_en = translator_ur2en(question)[0]['translation_text']
            answer_en = qa_chain.run(question_en)
            return translator_en2ur(answer_en)[0]['translation_text']
        return qa_chain.run(question)
    except Exception as e:
        # Surface chain/translation failures in the UI rather than crashing.
        return f"Error: {str(e)}"
127
+
128
# -------------------- Gradio Interface --------------------
status_text = "Loading..."

with gr.Blocks() as demo:
    gr.Markdown("## 📋 SL HealthCare Claim Checker (Bilingual: English / اردو)")

    status_box = gr.Textbox(label="Status", value=status_text, interactive=False)

    # NOTE(review): original indentation was lost in transit — the grouping
    # of these three controls inside the Row is inferred; confirm layout.
    with gr.Row():
        language = gr.Radio(choices=["English", "Urdu"], label="Select Language / زبان منتخب کریں", value="English")
        question = gr.Textbox(label="Enter your claim question / اپنا سوال درج کریں")
        ask_btn = gr.Button("Ask / پوچھیں")

    answer = gr.Textbox(label="Answer / جواب", lines=6)
    ask_btn.click(fn=ask_policy_question, inputs=[question, language], outputs=answer)

    def startup():
        """Build the QA chain at app load and report readiness in the status box."""
        global status_text
        initialize_policy()
        status_text = "Policy loaded. You may now ask questions."
        return status_text

    # Runs once when the app loads in the browser; event listeners must be
    # registered inside the Blocks context.
    demo.load(fn=startup, outputs=status_box)