Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,22 @@
|
|
1 |
-
# app.py
|
2 |
-
#
|
3 |
-
#
|
4 |
import os, re, faiss, zipfile, warnings, gradio as gr
|
5 |
from pathlib import Path
|
6 |
-
from typing
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
-
from PyPDF2
|
9 |
-
from docx
|
10 |
from docx.opc.exceptions import PackageNotFoundError
|
11 |
-
from openai
|
12 |
|
13 |
-
# βββββββββ 0. rΓ©sumΓ© β text
|
14 |
-
FILE = Path("my_resume.pdf")
|
15 |
|
16 |
-
def read_pdf(p: Path)->str:
|
17 |
return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)
|
18 |
|
19 |
-
def read_docx(p: Path)->str:
|
20 |
return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())
|
21 |
|
22 |
try:
|
@@ -27,84 +27,102 @@ except (PackageNotFoundError, KeyError, zipfile.BadZipFile):
|
|
27 |
|
28 |
text = re.sub(r"\s+", " ", raw).strip()
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
ARCH_NOTE = (
|
36 |
"ARCHITECTURE NOTE β The bot follows a Retrieval-Augmented Generation "
|
37 |
"(RAG) design: PDF β 180-token chunks β MiniLM-L6 embeddings β FAISS "
|
38 |
-
"similarity search β GPT-3.5-turbo answer constrained to
|
39 |
)
|
|
|
40 |
|
41 |
-
#
|
42 |
-
text += f" LinkedIn: {
|
43 |
|
44 |
-
# βββββββββ 1.
|
45 |
-
def chunkify(t: str, max_tok: int = 180)->List[str]:
|
46 |
out, buf, n = [], [], 0
|
47 |
for s in re.split(r"(?<=[.!?])\s+", t):
|
48 |
w = len(s.split())
|
49 |
if n + w > max_tok:
|
50 |
out.append(" ".join(buf)); buf, n = [], 0
|
51 |
buf.append(s); n += w
|
52 |
-
if buf:
|
|
|
53 |
return out
|
54 |
|
55 |
CHUNKS = chunkify(text)
|
56 |
|
57 |
-
|
58 |
-
vecs
|
59 |
faiss.normalize_L2(vecs)
|
60 |
index = faiss.IndexFlatIP(vecs.shape[1]); index.add(vecs)
|
61 |
|
62 |
-
def retrieve(q: str, k:int=4):
|
63 |
-
qv =
|
64 |
sims, idx = index.search(qv, k)
|
65 |
return sims[0], [CHUNKS[i] for i in idx[0]]
|
66 |
|
67 |
-
# βββββββββ 2. OpenAI client
|
68 |
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
69 |
MODEL = "gpt-3.5-turbo-0125"
|
70 |
-
|
71 |
SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
|
72 |
"If missing, reply exactly: \"I don't know based on the resume.\"")
|
73 |
|
74 |
-
def overlap(a: str, b: str)->bool:
|
75 |
return bool(set(re.findall(r"\w+", a.lower())) &
|
76 |
set(re.findall(r"\w+", b.lower())))
|
77 |
|
|
|
78 |
SAFE = {"experience","project","certification","certifications","education",
|
79 |
"skill","skills","summary","company","companies","role","linkedin",
|
80 |
"website","blog","portfolio","architecture"}
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
return "Please ask something related to my rΓ©sumΓ©."
|
86 |
|
87 |
-
#
|
88 |
sims, ctxs = retrieve(msg)
|
89 |
-
|
90 |
-
if max(sims) <
|
91 |
return "I don't know based on the resume."
|
92 |
|
93 |
-
#
|
94 |
-
ctx
|
95 |
-
ans
|
96 |
model=MODEL,
|
97 |
messages=[
|
98 |
-
{"role":"system","content":SYSTEM},
|
99 |
-
{"role":"user",
|
100 |
-
{"role":"user",
|
101 |
],
|
102 |
-
max_tokens=256,
|
|
|
103 |
).choices[0].message.content.strip()
|
104 |
|
105 |
return ans if overlap(ans, ctx) else "I don't know based on the resume."
|
106 |
|
107 |
-
# βββββββββ 3. Gradio UI
|
108 |
quick = [
|
109 |
"Professional Summary","Education details","Experience",
|
110 |
"Certifications","Skills","LinkedIn","Blog","Architecture"
|
@@ -121,7 +139,7 @@ with gr.Blocks(theme="soft") as demo:
|
|
121 |
inp = gr.Textbox(placeholder="Ask about my rΓ©sumΓ©β¦", show_label=False)
|
122 |
state = gr.State([])
|
123 |
|
124 |
-
# ENTER
|
125 |
def user_submit(msg, hist):
|
126 |
ans = generate(msg)
|
127 |
hist = hist + [{"role":"user","content":msg},
|
@@ -132,7 +150,7 @@ with gr.Blocks(theme="soft") as demo:
|
|
132 |
|
133 |
# QUICK buttons
|
134 |
def quick_send(hist, q):
|
135 |
-
ans
|
136 |
hist = hist + [{"role":"user","content":q},
|
137 |
{"role":"assistant","content":ans}]
|
138 |
return hist, hist
|
@@ -141,4 +159,5 @@ with gr.Blocks(theme="soft") as demo:
|
|
141 |
b.click(quick_send, [state, gr.State(q)], [chat, state])
|
142 |
|
143 |
if __name__ == "__main__":
|
144 |
-
|
|
|
|
1 |
+
# app.py ────────────────────────────────────────────────────────────────
|
2 |
+
# Pin Gradio ≤ 3.31.0 in requirements.txt so <a target="_blank"> is kept
|
3 |
+
# and place architecture.png beside this file.
|
4 |
import os, re, faiss, zipfile, warnings, gradio as gr
|
5 |
from pathlib import Path
|
6 |
+
from typing import List
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
+
from PyPDF2 import PdfReader
|
9 |
+
from docx import Document
|
10 |
from docx.opc.exceptions import PackageNotFoundError
|
11 |
+
from openai import OpenAI
|
12 |
|
13 |
+
# ───────── 0. résumé → plain-text ──────────────────────────────────────
|
14 |
+
FILE = Path("my_resume.pdf")
|
15 |
|
16 |
+
def read_pdf(p: Path) -> str:
    """Return the text of every page of the PDF at *p*, joined by spaces.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the join never receives a non-string value.
    """
    return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)
|
18 |
|
19 |
+
def read_docx(p: Path) -> str:
    """Return the text of the .docx at *p* as one space-joined string.

    Paragraphs whose text is empty or whitespace-only are skipped.
    """
    return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())
|
21 |
|
22 |
try:
|
|
|
27 |
|
28 |
text = re.sub(r"\s+", " ", raw).strip()
|
29 |
|
30 |
+
# ───────── 0-bis. extra searchable metadata ───────────────────────────
|
31 |
+
# HTML anchors for the profile links; target="_blank" opens them in a new tab.
LINK_MD = (
    '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">'
    'LinkedIn Profile</a>'
)
BLOG_MD = (
    '<a href="https://sfdcbrewery.github.io/" target="_blank">'
    'Technical Blog</a>'
)

# NOTE(review): the dash and arrows in this note were restored from
# mis-decoded UTF-8; confirm the exact glyphs against the original file.
ARCH_NOTE = (
    "ARCHITECTURE NOTE — The bot follows a Retrieval-Augmented Generation "
    "(RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS "
    "similarity search → GPT-3.5-turbo answer constrained to context."
)
ARCH_MD = f"{ARCH_NOTE}\n\n"

# Make them retrievable by the RAG index (even though we'll short-circuit).
text += f" LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_MD}"
|
44 |
|
45 |
+
# ───────── 1. text → embeddings → FAISS ───────────────────────────────
|
46 |
+
def chunkify(t: str, max_tok: int = 180) -> List[str]:
    """Split *t* into sentence-aligned chunks of roughly *max_tok* words.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``.
    Whole sentences are packed greedily into a chunk until adding the next
    one would push the whitespace-separated word count past *max_tok*.
    A single sentence longer than *max_tok* is kept intact as its own
    (oversized) chunk rather than being cut mid-sentence.

    Args:
        t: Text to split.
        max_tok: Soft upper bound on words per chunk.

    Returns:
        The chunks in document order; an empty list when *t* has no words.
    """
    out: List[str] = []
    buf: List[str] = []
    n = 0
    for s in re.split(r"(?<=[.!?])\s+", t):
        w = len(s.split())
        if w == 0:
            # Empty / whitespace-only fragment: nothing worth indexing.
            continue
        # Flush only a non-empty buffer; otherwise an over-long first
        # sentence would emit a blank "" chunk.
        if buf and n + w > max_tok:
            out.append(" ".join(buf))
            buf, n = [], 0
        buf.append(s)
        n += w
    if buf:
        out.append(" ".join(buf))
    return out
|
56 |
|
57 |
CHUNKS = chunkify(text)

# Embed every chunk once at import time and build an exact inner-product index.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
# Normalised in place: on unit vectors the inner product equals cosine similarity.
faiss.normalize_L2(vecs)
index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)
|
63 |
|
64 |
+
def retrieve(q: str, k: int = 4):
    """Return the similarity scores and texts of the chunks closest to *q*.

    The query is embedded with the same model as the chunks and
    L2-normalised before the inner-product search.

    Args:
        q: Query text.
        k: Maximum number of chunks to return.

    Returns:
        ``(sims, chunks)``: an array of scores and the matching chunk
        strings, both in the order FAISS returned them. Fewer than *k*
        items come back when the index holds fewer than *k* chunks.
    """
    qv = embedder.encode([q], convert_to_numpy=True)
    faiss.normalize_L2(qv)
    # Never ask for more neighbours than there are chunks.
    k = max(1, min(k, len(CHUNKS)))
    sims, idx = index.search(qv, k)
    # FAISS pads missing results with label -1; indexing CHUNKS with -1
    # would silently return the last chunk, so drop those slots.
    keep = idx[0] >= 0
    return sims[0][keep], [CHUNKS[i] for i in idx[0][keep]]
|
68 |
|
69 |
+
# ───────── 2. OpenAI client ────────────────────────────────────────────
|
70 |
# The API key is read from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
MODEL = "gpt-3.5-turbo-0125"
# System prompt: restrict answers to the supplied context and fix the
# exact fallback sentence used when the context lacks the answer.
SYSTEM = (
    "You are a helpful assistant. Answer ONLY with facts in the context. "
    "If missing, reply exactly: \"I don't know based on the resume.\""
)
|
74 |
|
75 |
+
def overlap(a: str, b: str) -> bool:
    """Return True when *a* and *b* share at least one word, ignoring case.

    Words are maximal runs of word characters (letters, digits,
    underscore); a single common word is enough.
    """
    words_a = set(re.findall(r"\w+", a.lower()))
    words_b = set(re.findall(r"\w+", b.lower()))
    return bool(words_a & words_b)
|
78 |
|
79 |
+
# ───────── 2-bis. guard words & static answers ─────────────────────────
|
80 |
# A question must contain at least one of these words to be treated as
# résumé-related.
SAFE = {
    "experience", "project", "certification", "certifications", "education",
    "skill", "skills", "summary", "company", "companies", "role", "linkedin",
    "website", "blog", "portfolio", "architecture",
}

# Keyword -> canned reply, matched as a substring of the lower-cased message.
STATIC_ANSWERS = {
    "linkedin": LINK_MD,
    "linked-in": LINK_MD,
    "blog": BLOG_MD,
    "architecture": ARCH_MD,
}
|
90 |
+
|
91 |
+
# ───────── 2-ter. generator ───────────────────────────────────────────
|
92 |
+
def generate(msg: str) -> str:
    """Answer *msg* from the résumé, or return a fixed fallback sentence.

    Steps, in order:
      A. If the message contains a STATIC_ANSWERS keyword, return that
         canned reply verbatim.
      B. If the message shares no word with SAFE, ask the user to stay on
         topic.
      C. Retrieve the closest chunks; give up when the best similarity is
         below the threshold (0.10 for messages under three words, 0.25
         otherwise).
      D. Ask the chat model to answer from the retrieved context, and
         accept the reply only if it shares at least one word with it.

    Args:
        msg: The user's question; ``None`` is treated like an empty string.

    Returns:
        The reply text to show in the chat.
    """
    lower_msg = (msg or "").lower().strip()

    # A. serve static responses verbatim
    for key, val in STATIC_ANSWERS.items():
        if key in lower_msg:
            return val

    # B. resume-related check (an empty/None message always stops here)
    if not (SAFE & set(re.findall(r"\w+", lower_msg))):
        return "Please ask something related to my résumé."

    # C. retrieve
    sims, ctxs = retrieve(msg)
    # Short queries embed less precisely, so they get a looser threshold.
    min_sim = 0.10 if len(msg.split()) < 3 else 0.25
    if len(sims) == 0 or max(sims) < min_sim:
        return "I don't know based on the resume."

    # D. GPT-3.5-turbo
    ctx = "\n".join(ctxs)
    reply = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"Context:\n{ctx}"},
            {"role": "user", "content": f"Question: {msg}"},
        ],
        max_tokens=256,
        temperature=0.2,
    )
    # The API may return no text content; treat that as an empty answer
    # instead of crashing on None.strip().
    ans = (reply.choices[0].message.content or "").strip()

    return ans if overlap(ans, ctx) else "I don't know based on the resume."
|
124 |
|
125 |
+
# ───────── 3. Gradio UI ────────────────────────────────────────────────
|
126 |
quick = [
|
127 |
"Professional Summary","Education details","Experience",
|
128 |
"Certifications","Skills","LinkedIn","Blog","Architecture"
|
|
|
139 |
inp = gr.Textbox(placeholder="Ask about my rΓ©sumΓ©β¦", show_label=False)
|
140 |
state = gr.State([])
|
141 |
|
142 |
+
# ENTER
|
143 |
def user_submit(msg, hist):
|
144 |
ans = generate(msg)
|
145 |
hist = hist + [{"role":"user","content":msg},
|
|
|
150 |
|
151 |
# QUICK buttons
|
152 |
def quick_send(hist, q):
    """Answer the quick-button prompt *q* and append the exchange to *hist*.

    Returns the extended history twice, one value per output component
    the click handler is wired to. *hist* itself is not mutated.
    """
    ans = generate(q)
    hist = hist + [
        {"role": "user", "content": q},
        {"role": "assistant", "content": ans},
    ]
    return hist, hist
|
|
|
159 |
b.click(quick_send, [state, gr.State(q)], [chat, state])
|
160 |
|
161 |
if __name__ == "__main__":
    # When running in HF Spaces, share=True is ignored; safe to leave as-is.
    demo.launch(share=True)
|