# app.py ────────────────────────────────────────────────────────────────
# requirements.txt should pin a Gradio release that supports
# gr.Chatbot(type="messages") (4.44+); the Chatbot below sets
# sanitize_html=False so <a target="_blank"> links are kept.
# Place architecture.png beside this file.
import os, re, faiss, zipfile, warnings, gradio as gr
from pathlib import Path
from typing import List
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
from docx.opc.exceptions import PackageNotFoundError
from openai import OpenAI
# ───────── 0. résumé → plain-text ──────────────────────────────────────
FILE = Path("my_resume.pdf")
def read_pdf(p: Path) -> str:
return " ".join(pg.extract_text() or "" for pg in PdfReader(p).pages)
def read_docx(p: Path) -> str:
return " ".join(par.text for par in Document(p).paragraphs if par.text.strip())
try:
raw = read_docx(FILE)
except (PackageNotFoundError, KeyError, zipfile.BadZipFile):
    warnings.warn("Reading résumé as PDF")
raw = read_pdf(FILE)
text = re.sub(r"\s+", " ", raw).strip()
# ───────── 0-bis. extra searchable metadata ───────────────────────────
LINK_MD = '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">' \
'LinkedIn Profile</a>'
BLOG_MD = '<a href="https://sfdcbrewery.github.io/" target="_blank">' \
'Technical Blog</a>'
ARCH_MD = (
"ARCHITECTURE NOTE β The bot follows a Retrieval-Augmented Generation "
"(RAG) design: PDF β 180-token chunks β MiniLM-L6 embeddings β FAISS "
"similarity search β GPT-3.5-turbo answer constrained to context."
)
# make them retrievable by the RAG index (even though we'll short-circuit)
text += f" LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_MD}"
# ───────── 1. text → embeddings → FAISS ───────────────────────────────
def chunkify(t: str, max_tok: int = 180) -> List[str]:
    """Greedily pack sentences into chunks of at most ~max_tok words."""
    out, buf, n = [], [], 0
    for s in re.split(r"(?<=[.!?])\s+", t):
        w = len(s.split())
        if buf and n + w > max_tok:      # flush before the budget overflows
            out.append(" ".join(buf)); buf, n = [], 0
        buf.append(s); n += w
    if buf:
        out.append(" ".join(buf))
    return out
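# Greedy packing in miniature (hypothetical input; each sentence is 2 words):
#     chunkify("A one. B two. C three.", max_tok=4)
#     -> ["A one. B two.", "C three."]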
CHUNKS = chunkify(text)
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
faiss.normalize_L2(vecs)
index = faiss.IndexFlatIP(vecs.shape[1]); index.add(vecs)
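# Inner product over L2-normalised vectors equals cosine similarity, so
# IndexFlatIP scores lie in [-1, 1] with higher meaning more similar.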
def retrieve(q: str, k: int = 4):
qv = embedder.encode([q], convert_to_numpy=True); faiss.normalize_L2(qv)
sims, idx = index.search(qv, k)
return sims[0], [CHUNKS[i] for i in idx[0]]
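# Usage sketch (query text is illustrative; results depend on the résumé):
#     sims, ctxs = retrieve("cloud certifications", k=2)
#     sims -> top-2 cosine scores, ctxs -> the matching résumé chunks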
# ───────── 2. OpenAI client ────────────────────────────────────────────
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
MODEL = "gpt-3.5-turbo-0125"
SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
"If missing, reply exactly: \"I don't know based on the resume.\"")
def overlap(a: str, b: str) -> bool:
return bool(set(re.findall(r"\w+", a.lower())) &
set(re.findall(r"\w+", b.lower())))
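# e.g. overlap("Worked at Acme Corp", "acme engineer") -> True (shares "acme")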
# ───────── 2-bis. guard words & static answers ─────────────────────────
SAFE = {"experience","project","certification","certifications","education",
"skill","skills","summary","company","companies","role","linkedin",
"website","blog","portfolio","architecture"}
STATIC_ANSWERS = {
"linkedin": LINK_MD,
"linked-in": LINK_MD,
"blog": BLOG_MD,
"architecture": ARCH_MD
}
# ───────── 2-ter. generator ───────────────────────────────────────────
def generate(msg: str) -> str:
lower_msg = msg.lower().strip()
# A. serve static responses verbatim
for key, val in STATIC_ANSWERS.items():
if key in lower_msg:
return val
# B. resume-related check
if not (SAFE & set(re.findall(r"\w+", lower_msg))):
return "Please ask something related to my rΓ©sumΓ©."
# C. retrieve
sims, ctxs = retrieve(msg)
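    # shorter queries share fewer words with any chunk, so relax the floor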
min_sim = 0.10 if len(msg.split()) < 3 else 0.25
if max(sims) < min_sim:
return "I don't know based on the resume."
# D. GPT-3.5-turbo
ctx = "\n".join(ctxs)
ans = client.chat.completions.create(
model=MODEL,
messages=[
{"role": "system", "content": SYSTEM},
{"role": "user", "content": f"Context:\n{ctx}"},
{"role": "user", "content": f"Question: {msg}"}
],
max_tokens=256,
temperature=0.2
).choices[0].message.content.strip()
return ans if overlap(ans, ctx) else "I don't know based on the resume."
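# Behaviour sketch (static answers are exact; GPT answers depend on the résumé):
#     generate("linkedin")            -> LINK_MD (short-circuit, no retrieval)
#     generate("how is the weather?") -> "Please ask something related to my résumé."
#     generate("skills")              -> GPT answer grounded in retrieved chunks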
# ───────── 3. Gradio UI ────────────────────────────────────────────────
quick = [
"Professional Summary","Education details","Experience",
"Certifications","Skills","LinkedIn","Blog","Architecture"
]
with gr.Blocks(theme="soft") as demo:
with gr.Row(equal_height=True):
with gr.Column(min_width=170, scale=0):
gr.Markdown("### Quick questions")
btns = [gr.Button(q) for q in quick]
with gr.Column(scale=4):
            chat = gr.Chatbot(type="messages", label="Résumé Bot", height=520,
                              sanitize_html=False)  # keep <a target="_blank">
            inp = gr.Textbox(placeholder="Ask about my résumé…", show_label=False)
state = gr.State([])
quick_q = gr.Textbox(visible=False) # Hidden textbox for quick button input
    # ENTER in the textbox sends the message
def user_submit(msg, hist):
ans = generate(msg)
hist = hist + [{"role":"user","content":msg},
{"role":"assistant","content":ans}]
return "", hist, hist
inp.submit(user_submit, [inp, state], [inp, chat, state])
def quick_send(hist, q):
ans = generate(q)
hist = hist + [{"role":"user","content":q},
{"role":"assistant","content":ans}]
return hist, hist
    for b, q in zip(btns, quick):
        b.click(
            fn=lambda q=q: q,        # write this button's label into quick_q
            inputs=None,
            outputs=quick_q
        ).then(
            quick_send, [state, quick_q], [chat, state]
        )
if __name__ == "__main__":
demo.launch(share=True)