srikol committed
Commit 20da24e · verified · 1 Parent(s): 873c8fa

Update app.py

Files changed (1)
  1. app.py +47 -28
app.py CHANGED
@@ -1,3 +1,6 @@
+# app.py ────────────────────────────────────────────────────────────────
+# Pin Gradio ≤ 3.31.0 in requirements.txt so <a target="_blank"> is kept
+# and place architecture.png beside this file.
 import os, re, faiss, zipfile, warnings, gradio as gr
 from pathlib import Path
 from typing import List
@@ -6,13 +9,12 @@ from PyPDF2 import PdfReader
 from docx import Document
 from docx.opc.exceptions import PackageNotFoundError
 from openai import OpenAI
-from openai import OpenAIError
 
 # ───────── 0. résumé → plain-text ──────────────────────────────────────
-FILE = Path(__file__).parent / "my_resume.pdf"
+FILE = Path("my_resume.pdf")
 
 def read_pdf(p: Path) -> str:
-    return " ".join((pg.extract_text() or "") for pg in PdfReader(p).pages)
+    return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)
 
 def read_docx(p: Path) -> str:
     return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())
@@ -26,12 +28,18 @@ except (PackageNotFoundError, KeyError, zipfile.BadZipFile):
 text = re.sub(r"\s+", " ", raw).strip()
 
 # ───────── 0-bis. extra searchable metadata ───────────────────────────
-LINK_MD = '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">LinkedIn Profile</a>'
-BLOG_MD = '<a href="https://sfdcbrewery.github.io/" target="_blank">Technical Blog</a>'
-ARCH_NOTE = (
-    '<b>ARCHITECTURE NOTE</b> – The bot follows a Retrieval-Augmented Generation (RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS similarity search → GPT-3.5-turbo answer constrained to context.'
+LINK_MD = '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">' \
+          'LinkedIn Profile</a>'
+BLOG_MD = '<a href="https://sfdcbrewery.github.io/" target="_blank">' \
+          'Technical Blog</a>'
+ARCH_MD = (
+    "ARCHITECTURE NOTE – The bot follows a Retrieval-Augmented Generation "
+    "(RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS "
+    "similarity search → GPT-3.5-turbo answer constrained to context."
 )
-text += f" LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_NOTE}"
+
+# make them retrievable by the RAG index (even though we'll short-circuit)
+text += f" LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_MD}"
 
 # ───────── 1. text → embeddings → FAISS ───────────────────────────────
 def chunkify(t: str, max_tok: int = 180) -> List[str]:
@@ -46,13 +54,12 @@ def chunkify(t: str, max_tok: int = 180) -> List[str]:
     return out
 
 CHUNKS = chunkify(text)
+
 embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-try:
-    vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
-except Exception as e:
-    raise RuntimeError("Embedding model failed to encode résumé.") from e
+vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
 faiss.normalize_L2(vecs)
 index = faiss.IndexFlatIP(vecs.shape[1]); index.add(vecs)
+
 def retrieve(q: str, k: int = 4):
     qv = embedder.encode([q], convert_to_numpy=True); faiss.normalize_L2(qv)
     sims, idx = index.search(qv, k)
@@ -65,8 +72,10 @@ SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
           "If missing, reply exactly: \"I don't know based on the resume.\"")
 
 def overlap(a: str, b: str) -> bool:
-    return bool(set(re.findall(r"\w+", a.lower())) & set(re.findall(r"\w+", b.lower())))
+    return bool(set(re.findall(r"\w+", a.lower())) &
+                set(re.findall(r"\w+", b.lower())))
 
+# ───────── 2-bis. guard words & static answers ─────────────────────────
 SAFE = {"experience","project","certification","certifications","education",
         "skill","skills","summary","company","companies","role","linkedin",
         "website","blog","portfolio","architecture"}
@@ -75,34 +84,41 @@ STATIC_ANSWERS = {
     "linkedin": LINK_MD,
     "linked-in": LINK_MD,
     "blog": BLOG_MD,
-    "architecture": ARCH_NOTE
+    "architecture": ARCH_MD
 }
 
+# ───────── 2-ter. generator ───────────────────────────────────────────
 def generate(msg: str) -> str:
     lower_msg = msg.lower().strip()
+
+    # A. serve static responses verbatim
     for key, val in STATIC_ANSWERS.items():
         if key in lower_msg:
             return val
+
+    # B. resume-related check
     if not (SAFE & set(re.findall(r"\w+", lower_msg))):
         return "Please ask something related to my résumé."
+
+    # C. retrieve
     sims, ctxs = retrieve(msg)
     min_sim = 0.10 if len(msg.split()) < 3 else 0.25
     if max(sims) < min_sim:
         return "I don't know based on the resume."
+
+    # D. GPT-3.5-turbo
     ctx = "\n".join(ctxs)
-    try:
-        ans = client.chat.completions.create(
-            model=MODEL,
-            messages=[
-                {"role": "system", "content": SYSTEM},
-                {"role": "user", "content": f"Context:\n{ctx}"},
-                {"role": "user", "content": f"Question: {msg}"}
-            ],
-            max_tokens=256,
-            temperature=0.2
-        ).choices[0].message.content.strip()
-    except OpenAIError:
-        return "OpenAI API error. Please try again."
+    ans = client.chat.completions.create(
+        model=MODEL,
+        messages=[
+            {"role": "system", "content": SYSTEM},
+            {"role": "user", "content": f"Context:\n{ctx}"},
+            {"role": "user", "content": f"Question: {msg}"}
+        ],
+        max_tokens=256,
+        temperature=0.2
+    ).choices[0].message.content.strip()
+
     return ans if overlap(ans, ctx) else "I don't know based on the resume."
 
 # ───────── 3. Gradio UI ────────────────────────────────────────────────
@@ -118,10 +134,11 @@ with gr.Blocks(theme="soft") as demo:
         btns = [gr.Button(q) for q in quick]
 
     with gr.Column(scale=4):
-        chat = gr.Chatbot(type="messages", label="Résumé Bot", height=520, render_markdown=True)
+        chat = gr.Chatbot(type="messages", label="Résumé Bot", height=520)
         inp = gr.Textbox(placeholder="Ask about my résumé…", show_label=False)
         state = gr.State([])
 
+    # ENTER
     def user_submit(msg, hist):
        ans = generate(msg)
        hist = hist + [{"role":"user","content":msg},
@@ -130,6 +147,7 @@ with gr.Blocks(theme="soft") as demo:
 
     inp.submit(user_submit, [inp, state], [inp, chat, state])
 
+    # QUICK buttons
     def quick_send(hist, q):
        ans = generate(q)
        hist = hist + [{"role":"user","content":q},
@@ -140,4 +158,5 @@ with gr.Blocks(theme="soft") as demo:
         b.click(quick_send, [state, q], [chat, state])
 
 if __name__ == "__main__":
+    # When running in HF Spaces, share=True is ignored; safe to leave as-is.
    demo.launch(share=True)
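
The header comment added in this commit says to pin Gradio ≤ 3.31.0 in requirements.txt. A minimal requirements.txt sketch consistent with that pin and with the file's imports; only the Gradio ceiling comes from the code, the other entries are inferred from the imports and left unpinned:

# requirements.txt (sketch; only the gradio ceiling is stated in app.py)
gradio<=3.31.0
openai
faiss-cpu
sentence-transformers
PyPDF2
python-docx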
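Note that the hunks above call client.chat.completions.create(...) and reference MODEL, but neither is defined in the changed lines; both live in an unchanged part of app.py. A minimal sketch of what that setup presumably looks like, assuming the openai v1 client and the GPT-3.5-turbo model named in ARCH_MD (the environment-variable name is a guess):

# hypothetical reconstruction of the unchanged setup the diff assumes
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # key location is an assumption
MODEL = "gpt-3.5-turbo"                               # model named in ARCH_MD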
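For illustration, the retrieval step in generate() can be exercised on its own: retrieve() returns the cosine similarities (inner products of the L2-normalised vectors) alongside the k matching résumé chunks, which generate() then thresholds at 0.10 or 0.25 depending on query length. A hypothetical session:

sims, ctxs = retrieve("What certifications do you have?")
print(sims)  # similarity scores for the k=4 nearest chunks, used for the threshold check
print(ctxs)  # the matching 180-token résumé chunks passed to GPT-3.5-turbo as context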