saadawaissheikh committed
Commit ef9bc84 · verified · 1 Parent(s): 0c024a4

Update app.py

Files changed (1)
  1. app.py +17 -13
app.py CHANGED
@@ -4,7 +4,7 @@ import pdfplumber
 import re
 
 from langchain.docstore.document import Document
-from langchain.vectorstores import FAISS
+from langchain_community.vectorstores import FAISS  # ✅ Fixed deprecation warning
 from langchain.embeddings.base import Embeddings
 from sklearn.feature_extraction.text import TfidfVectorizer
 from langchain.chains import RetrievalQA
@@ -12,16 +12,18 @@ from langchain.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from transformers import pipeline
 
-# Hugging Face-compatible OpenRouter setup
+
+# Set OpenRouter API env vars (used by ChatOpenAI)
 os.environ["OPENAI_API_KEY"] = os.environ.get("OPENROUTER_API_KEY")
 os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
 os.environ["OPENAI_API_HEADERS"] = '{"HTTP-Referer":"https://huggingface.co", "X-Title":"PDF-RAG"}'
 
-# Translation models (global)
+# Global variables
+qa_chain = None
 translator_en2ur = None
 translator_ur2en = None
 
-# Load and clean the PDF
+# -------------------- PDF Extraction --------------------
 def extract_clean_sections(file_path):
     with pdfplumber.open(file_path) as pdf:
         full_text = ""
@@ -43,7 +45,7 @@ def extract_clean_sections(file_path):
         docs.append(Document(page_content=f"{title}:\n{content}", metadata={"section": title}))
     return docs
 
-# TF-IDF Embedding for LangChain
+# -------------------- TF-IDF Embedder --------------------
 class TfidfEmbedding(Embeddings):
     def __init__(self):
         self.vectorizer = TfidfVectorizer()
@@ -57,7 +59,7 @@ class TfidfEmbedding(Embeddings):
     def embed_query(self, text):
         return self.vectorizer.transform([text]).toarray()[0]
 
-# Prompt template
+# -------------------- Custom Prompt --------------------
 TEMPLATE = """
 You are a strict healthcare policy checker for Systems Ltd.
 Always begin your answer clearly:
@@ -71,15 +73,16 @@ Answer:
 """
 custom_prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
 
-# Initialize policy + QA chain + translation models
-qa_chain = None
-
+# -------------------- Policy Initialization --------------------
 def initialize_policy():
     global qa_chain, translator_en2ur, translator_ur2en
+
     docs = extract_clean_sections("healthcare_policy.pdf")
     texts = [doc.page_content for doc in docs]
+
     embedder = TfidfEmbedding()
     embedder.fit(texts)
+
     vectordb = FAISS.from_texts(texts, embedder)
     retriever = vectordb.as_retriever()
 
@@ -102,12 +105,13 @@ def initialize_policy():
         chain_type_kwargs={"prompt": custom_prompt}
     )
 
+    # ✅ Load translation models
    translator_en2ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
    translator_ur2en = pipeline("translation", model="Helsinki-NLP/opus-mt-ur-en")
 
-# QA logic with bilingual support
-def ask_policy_question(inputs):
-    question, language = inputs
+
+# -------------------- QA with Bilingual Support --------------------
+def ask_policy_question(question, language):
     if qa_chain is None:
         return "The policy is still loading. Please wait."
     try:
@@ -121,7 +125,7 @@ def ask_policy_question(inputs):
     except Exception as e:
         return f"Error: {str(e)}"
 
-# Gradio UI
+# -------------------- Gradio Interface --------------------
 status_text = "Loading..."
 
 with gr.Blocks() as demo:
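
Note: the hunks stop at the top of the Gradio block, so the UI wiring that goes with the new two-argument ask_policy_question(question, language) is not shown in this commit. Below is a minimal, hypothetical sketch of such wiring, assuming a Textbox for the question, a Radio for the language, and a Button whose click handler passes both components; the component names and the stub handler are illustrative, not taken from app.py.

# Illustrative sketch only (not part of this commit). Gradio maps each
# component listed in `inputs` to one positional argument, which is why
# ask_policy_question(question, language) replaces the old tuple-unpacking
# version.
import gradio as gr

def ask_policy_question(question, language):
    # Stub standing in for the handler defined in app.py.
    return f"[{language}] answer for: {question}"

with gr.Blocks() as demo:
    gr.Markdown("## Healthcare Policy Q&A")
    question_box = gr.Textbox(label="Question")
    lang_choice = gr.Radio(["English", "Urdu"], value="English", label="Answer language")
    ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer")

    # Two input components -> two positional parameters of ask_policy_question.
    ask_btn.click(fn=ask_policy_question,
                  inputs=[question_box, lang_choice],
                  outputs=answer_box)

demo.launch()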