srikol committed
Commit 4f65160 · verified · 1 Parent(s): 79f0790

Update app.py

Files changed (1):
  1. app.py +65 -46

app.py CHANGED
@@ -1,22 +1,22 @@
- # app.py ──────────────────────────────────────────────────────────────
- # Requires: gradio<=3.31.0 (later versions strip target="_blank")
- # architecture.png (diagram you supply) ─ place beside app.py
  import os, re, faiss, zipfile, warnings, gradio as gr
  from pathlib import Path
- from typing import List
  from sentence_transformers import SentenceTransformer
- from PyPDF2 import PdfReader
- from docx import Document
  from docx.opc.exceptions import PackageNotFoundError
- from openai import OpenAI

- # ───────── 0. résumé → text ───────────────────────────────────────────
- FILE = Path("my_resume.pdf")   # already in repo

- def read_pdf(p: Path)->str:
      return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)

- def read_docx(p: Path)->str:
      return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())

  try:
@@ -27,84 +27,102 @@ except (PackageNotFoundError, KeyError, zipfile.BadZipFile):

  text = re.sub(r"\s+", " ", raw).strip()

- # Extra searchable metadata (Markdown links open in a new tab with ctrl/cmd-click)
- LINKEDIN = "[LinkedIn Profile](https://www.linkedin.com/in/sriharideep/)"
- BLOG = "[Technical Blog](https://sfdcbrewery.github.io/)"
- ARCH_MD = "![Architecture Diagram](architecture.png)"
-
  ARCH_NOTE = (
      "ARCHITECTURE NOTE – The bot follows a Retrieval-Augmented Generation "
      "(RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS "
-     "similarity search → GPT-3.5-turbo answer constrained to retrieved context."
  )

- # Append so they are embedded and retrievable
- text += f" LinkedIn: {LINKEDIN} Blog: {BLOG} {ARCH_NOTE} {ARCH_MD}"

- # ───────── 1. chunk → FAISS ───────────────────────────────────────────
- def chunkify(t: str, max_tok: int = 180)->List[str]:
      out, buf, n = [], [], 0
      for s in re.split(r"(?<=[.!?])\s+", t):
          w = len(s.split())
          if n + w > max_tok:
              out.append(" ".join(buf)); buf, n = [], 0
          buf.append(s); n += w
-     if buf: out.append(" ".join(buf))
      return out

  CHUNKS = chunkify(text)

- embed = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
- vecs = embed.encode(CHUNKS, convert_to_numpy=True)
  faiss.normalize_L2(vecs)
  index = faiss.IndexFlatIP(vecs.shape[1]); index.add(vecs)

- def retrieve(q: str, k:int=4):
-     qv = embed.encode([q], convert_to_numpy=True); faiss.normalize_L2(qv)
      sims, idx = index.search(qv, k)
      return sims[0], [CHUNKS[i] for i in idx[0]]

- # ───────── 2. OpenAI client ───────────────────────────────────────────
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
  MODEL = "gpt-3.5-turbo-0125"
-
  SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
            "If missing, reply exactly: \"I don't know based on the resume.\"")

- def overlap(a: str, b: str)->bool:
      return bool(set(re.findall(r"\w+", a.lower())) &
                  set(re.findall(r"\w+", b.lower())))

  SAFE = {"experience","project","certification","certifications","education",
          "skill","skills","summary","company","companies","role","linkedin",
          "website","blog","portfolio","architecture"}

- def generate(msg: str)->str:
-     # 0. Must be résumé-related
-     if not (SAFE & set(re.findall(r"\w+", msg.lower()))):
          return "Please ask something related to my résumé."

-     # 1. Retrieve
      sims, ctxs = retrieve(msg)
-     thresh = 0.10 if len(msg.split()) < 3 else 0.25
-     if max(sims) < thresh:
          return "I don't know based on the resume."

-     # 2. LLM
-     ctx = "\n".join(ctxs)
-     ans = client.chat.completions.create(
          model=MODEL,
          messages=[
-             {"role":"system","content":SYSTEM},
-             {"role":"user", "content":f"Context:\n{ctx}"},
-             {"role":"user", "content":f"Question: {msg}"}
          ],
-         max_tokens=256, temperature=0.2
      ).choices[0].message.content.strip()

      return ans if overlap(ans, ctx) else "I don't know based on the resume."

- # ───────── 3. Gradio UI ───────────────────────────────────────────────
  quick = [
      "Professional Summary","Education details","Experience",
      "Certifications","Skills","LinkedIn","Blog","Architecture"
@@ -121,7 +139,7 @@ with gr.Blocks(theme="soft") as demo:
      inp = gr.Textbox(placeholder="Ask about my résumé…", show_label=False)
      state = gr.State([])

-     # ENTER key
      def user_submit(msg, hist):
          ans = generate(msg)
          hist = hist + [{"role":"user","content":msg},
@@ -132,7 +150,7 @@ with gr.Blocks(theme="soft") as demo:

      # QUICK buttons
      def quick_send(hist, q):
-         ans = generate(q)
          hist = hist + [{"role":"user","content":q},
                         {"role":"assistant","content":ans}]
          return hist, hist
@@ -141,4 +159,5 @@ with gr.Blocks(theme="soft") as demo:
      b.click(quick_send, [state, gr.State(q)], [chat, state])

  if __name__ == "__main__":
-     demo.launch(share=True)   # remove share=True if not needed

+ # app.py ────────────────────────────────────────────────────────────────
+ # Pin Gradio ≤ 3.31.0 in requirements.txt so <a target="_blank"> is kept
+ # and place architecture.png beside this file.
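+ # A matching requirements.txt might look like this (only the gradio pin
+ # is stated above; the other package names are assumptions inferred from
+ # the imports below):
+ #     gradio<=3.31.0
+ #     sentence-transformers
+ #     faiss-cpu
+ #     PyPDF2
+ #     python-docx
+ #     openai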
  import os, re, faiss, zipfile, warnings, gradio as gr
  from pathlib import Path
+ from typing import List
  from sentence_transformers import SentenceTransformer
+ from PyPDF2 import PdfReader
+ from docx import Document
  from docx.opc.exceptions import PackageNotFoundError
+ from openai import OpenAI

+ # ───────── 0. résumé → plain-text ──────────────────────────────────────
+ FILE = Path("my_resume.pdf")

+ def read_pdf(p: Path) -> str:
      return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)

+ def read_docx(p: Path) -> str:
      return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())

  try:

  text = re.sub(r"\s+", " ", raw).strip()

+ # ───────── 0-bis. extra searchable metadata ───────────────────────────
+ LINK_MD = '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">' \
+           'LinkedIn Profile</a>'
+ BLOG_MD = '<a href="https://sfdcbrewery.github.io/" target="_blank">' \
+           'Technical Blog</a>'
  ARCH_NOTE = (
      "ARCHITECTURE NOTE – The bot follows a Retrieval-Augmented Generation "
      "(RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS "
+     "similarity search → GPT-3.5-turbo answer constrained to context."
  )
+ ARCH_MD = f"{ARCH_NOTE}\n\n![Architecture Diagram](architecture.png)"

+ # make them retrievable by the RAG index (even though we’ll short-circuit)
+ text += f" LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_MD}"

+ # ───────── 1. text → embeddings → FAISS ───────────────────────────────
+ def chunkify(t: str, max_tok: int = 180) -> List[str]:
      out, buf, n = [], [], 0
      for s in re.split(r"(?<=[.!?])\s+", t):
          w = len(s.split())
          if n + w > max_tok:
              out.append(" ".join(buf)); buf, n = [], 0
          buf.append(s); n += w
+     if buf:
+         out.append(" ".join(buf))
      return out
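+ # Note: "tokens" above are whitespace-split words, a rough approximation.
+ # Tiny worked example: chunkify("A. B. C.", max_tok=2) -> ["A. B.", "C."]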

  CHUNKS = chunkify(text)

+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
  faiss.normalize_L2(vecs)
  index = faiss.IndexFlatIP(vecs.shape[1]); index.add(vecs)
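+ # With L2-normalised vectors, inner product equals cosine similarity,
+ # so this flat index ranks chunks by cosine score.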

+ def retrieve(q: str, k: int = 4):
+     qv = embedder.encode([q], convert_to_numpy=True); faiss.normalize_L2(qv)
      sims, idx = index.search(qv, k)
      return sims[0], [CHUNKS[i] for i in idx[0]]
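+ # Hypothetical usage:
+ #   sims, ctxs = retrieve("What certifications do you hold?")
+ #   # sims: top-4 cosine scores, ctxs: the 4 best-matching chunks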

+ # ───────── 2. OpenAI client ────────────────────────────────────────────
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
  MODEL = "gpt-3.5-turbo-0125"
  SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
            "If missing, reply exactly: \"I don't know based on the resume.\"")

+ def overlap(a: str, b: str) -> bool:
      return bool(set(re.findall(r"\w+", a.lower())) &
                  set(re.findall(r"\w+", b.lower())))
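+ # Coarse lexical guard: any single shared word (even "the") counts as
+ # overlap, so it only rejects answers with no words from the context.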

+ # ───────── 2-bis. guard words & static answers ─────────────────────────
  SAFE = {"experience","project","certification","certifications","education",
          "skill","skills","summary","company","companies","role","linkedin",
          "website","blog","portfolio","architecture"}

+ STATIC_ANSWERS = {
+     "linkedin": LINK_MD,
+     "linked-in": LINK_MD,
+     "blog": BLOG_MD,
+     "architecture": ARCH_MD
+ }
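+ # These keys are matched as substrings of the lowercased question in
+ # generate() below, so e.g. "my linkedin?" or "blog link please" get a
+ # static answer before any retrieval runs.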
+
+ # ───────── 2-ter. generator ───────────────────────────────────────────
+ def generate(msg: str) -> str:
+     lower_msg = msg.lower().strip()
+
+     # A. serve static responses verbatim
+     for key, val in STATIC_ANSWERS.items():
+         if key in lower_msg:
+             return val
+
+     # B. resume-related check
+     if not (SAFE & set(re.findall(r"\w+", lower_msg))):
          return "Please ask something related to my résumé."

+     # C. retrieve
      sims, ctxs = retrieve(msg)
+     min_sim = 0.10 if len(msg.split()) < 3 else 0.25
+     if max(sims) < min_sim:
          return "I don't know based on the resume."

+     # D. GPT-3.5-turbo
+     ctx = "\n".join(ctxs)
+     ans = client.chat.completions.create(
          model=MODEL,
          messages=[
+             {"role": "system", "content": SYSTEM},
+             {"role": "user", "content": f"Context:\n{ctx}"},
+             {"role": "user", "content": f"Question: {msg}"}
          ],
+         max_tokens=256,
+         temperature=0.2
      ).choices[0].message.content.strip()

      return ans if overlap(ans, ctx) else "I don't know based on the resume."
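+ # Worked examples (hypothetical inputs):
+ #   generate("Architecture")     -> ARCH_MD, served from STATIC_ANSWERS
+ #   generate("Favourite movie?") -> the please-ask-about-my-résumé reply
+ #                                   (no SAFE keyword matches)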

+ # ───────── 3. Gradio UI ────────────────────────────────────────────────
  quick = [
      "Professional Summary","Education details","Experience",
      "Certifications","Skills","LinkedIn","Blog","Architecture"

      inp = gr.Textbox(placeholder="Ask about my résumé…", show_label=False)
      state = gr.State([])

+     # ENTER
      def user_submit(msg, hist):
          ans = generate(msg)
          hist = hist + [{"role":"user","content":msg},

      # QUICK buttons
      def quick_send(hist, q):
+         ans = generate(q)
          hist = hist + [{"role":"user","content":q},
                         {"role":"assistant","content":ans}]
          return hist, hist

      b.click(quick_send, [state, gr.State(q)], [chat, state])

  if __name__ == "__main__":
+     # When running in HF Spaces, share=True is ignored; safe to leave as-is.
+     demo.launch(share=True)
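+ # Local sanity check (assumes OPENAI_API_KEY is exported in your shell):
+ #   export OPENAI_API_KEY=...
+ #   python app.py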