|
|
|
|
|
|
|
import os, re, faiss, zipfile, warnings, gradio as gr |
|
from pathlib import Path |
|
from typing import List |
|
from sentence_transformers import SentenceTransformer |
|
from PyPDF2 import PdfReader |
|
from docx import Document |
|
from docx.opc.exceptions import PackageNotFoundError |
|
from openai import OpenAI |
|
|
|
|
|
# Location of the résumé document that is loaded at import time.
FILE: Path = Path("my_resume.pdf")
|
|
|
def read_pdf(p: Path) -> str:
    """Return the text of every page of the PDF at *p*, joined by single spaces.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string instead.
    """
    reader = PdfReader(p)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return " ".join(page_texts)
|
|
|
def read_docx(p: Path) -> str:
    """Return the non-blank paragraphs of the DOCX at *p*, joined by single spaces.

    Paragraphs whose text is empty or whitespace-only are skipped.
    """
    kept = []
    for paragraph in Document(p).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return " ".join(kept)
|
|
|
# Try the DOCX reader first; when the file cannot be opened as a DOCX
# package, fall back to the PDF reader and say so.
try:
    raw = read_docx(FILE)
except (PackageNotFoundError, KeyError, zipfile.BadZipFile):
    warnings.warn("Reading résumé as PDF")
    raw = read_pdf(FILE)

# Collapse every run of whitespace (including newlines) to one space so the
# sentence splitter and the embedder see clean, single-line text.
text = re.sub(r"\s+", " ", raw).strip()
|
|
|
|
|
# HTML anchors returned verbatim for the "LinkedIn" / "Blog" questions.
LINK_MD = (
    '<a href="https://www.linkedin.com/in/sriharideep/" target="_blank">'
    'LinkedIn Profile</a>'
)
BLOG_MD = (
    '<a href="https://sfdcbrewery.github.io/" target="_blank">'
    'Technical Blog</a>'
)

# One-paragraph description of the bot's pipeline. The dash and arrows are
# written as real Unicode characters (the literal previously held mis-decoded
# bytes in their place).
ARCH_MD = (
    "ARCHITECTURE NOTE — The bot follows a Retrieval-Augmented Generation "
    "(RAG) design: PDF → 180-token chunks → MiniLM-L6 embeddings → FAISS "
    "similarity search → GPT-3.5-turbo answer constrained to context."
)
|
|
|
|
|
# Append the link and architecture blurbs to the résumé text itself.
text = f"{text} LinkedIn: {LINK_MD} Blog: {BLOG_MD} {ARCH_MD}"
|
|
|
|
|
def chunkify(t: str, max_tok: int = 180) -> List[str]:
    """Group the sentences of *t* into chunks of at most *max_tok* words.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    sentences are packed greedily in order. Size is measured in
    whitespace-separated words, not model tokens.

    Args:
        t: Text to split.
        max_tok: Maximum number of words per chunk. A single sentence longer
            than this is never cut; it becomes a chunk of its own.

    Returns:
        The chunks in document order. Never contains an empty string; empty
        or whitespace-only input yields ``[]``.
    """
    chunks: List[str] = []
    current: List[str] = []
    count = 0
    for sentence in re.split(r"(?<=[.!?])\s+", t):
        words = len(sentence.split())
        if not words:
            # Blank fragment (empty input or trailing whitespace): nothing to keep.
            continue
        # Flush only a non-empty buffer; otherwise an over-long first
        # sentence would emit a spurious "" chunk.
        if current and count + words > max_tok:
            chunks.append(" ".join(current))
            current, count = [], 0
        current.append(sentence)
        count += words
    if current:
        chunks.append(" ".join(current))
    return chunks
|
|
|
# Retrieval corpus: the résumé text cut into chunks.
CHUNKS = chunkify(text)

# Sentence-embedding model used for both the corpus and incoming questions.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed every chunk and L2-normalise the vectors, so that the inner-product
# index below scores by cosine similarity.
vecs = embedder.encode(CHUNKS, convert_to_numpy=True)
faiss.normalize_L2(vecs)

# Inner-product index sized to the embedding width, filled with all chunks.
index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)
|
|
|
def retrieve(q: str, k: int = 4):
    """Return the chunks most similar to question *q*.

    Args:
        q: The user's question.
        k: Maximum number of chunks to return. Fewer are returned when the
            index holds fewer than *k* chunks.

    Returns:
        A pair ``(sims, chunks)``: the similarity scores as a 1-D array and
        the matching entries of ``CHUNKS``, both in the order the index
        returned them.
    """
    qv = embedder.encode([q], convert_to_numpy=True)
    # Normalise the query like the indexed vectors so scores are cosine similarities.
    faiss.normalize_L2(qv)

    # Never ask for more neighbours than there are chunks.
    k = min(k, len(CHUNKS))
    sims, idx = index.search(qv, k)

    # FAISS marks missing results with label -1; used as a Python index that
    # would silently return CHUNKS[-1], so drop those slots explicitly.
    found = idx[0] >= 0
    return sims[0][found], [CHUNKS[i] for i in idx[0][found]]
|
|
|
|
|
# Chat model used to phrase the final answer.
MODEL = "gpt-3.5-turbo-0125"

# System prompt: restrict the model to the supplied context and fix the exact
# refusal sentence it must use when the context lacks the answer.
SYSTEM = (
    "You are a helpful assistant. Answer ONLY with facts in the context. "
    "If missing, reply exactly: \"I don't know based on the resume.\""
)

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
|
|
def overlap(a: str, b: str) -> bool:
    """Report whether *a* and *b* have at least one word in common.

    Words are maximal runs of regex word characters, compared
    case-insensitively.
    """
    words_a = set(re.findall(r"\w+", a.lower()))
    words_b = set(re.findall(r"\w+", b.lower()))
    return not words_a.isdisjoint(words_b)
|
|
|
|
|
# Topic keywords: a question is only sent to retrieval when at least one of
# its words appears here. Singular and plural forms are listed side by side
# because the check is an exact word match.
SAFE = {
    "experience", "experiences",
    "project", "projects",
    "certification", "certifications",
    "education",
    "skill", "skills",
    "summary",
    "company", "companies",
    "role", "roles",
    "linkedin",
    "website",
    "blog",
    "portfolio",
    "architecture",
}
|
|
|
# Canned replies keyed by a substring of the lower-cased question. Insertion
# order is the lookup order: both LinkedIn spellings first, then blog, then
# architecture.
STATIC_ANSWERS = {
    **dict.fromkeys(("linkedin", "linked-in"), LINK_MD),
    "blog": BLOG_MD,
    "architecture": ARCH_MD,
}
|
|
|
|
|
def generate(msg: str) -> str:
    """Answer a résumé question, or decline when it cannot be grounded.

    Steps, in order:
      1. If the lower-cased message contains a ``STATIC_ANSWERS`` key, return
         that canned answer.
      2. If no word of the message is in ``SAFE``, ask the user to stay on
         topic.
      3. Retrieve the most similar chunks; if the best similarity is below
         the threshold (0.10 for messages under three words, else 0.25),
         return the "don't know" sentence.
      4. Ask the chat model to answer from the retrieved context only, and
         return its answer only if it shares at least one word with that
         context.

    Args:
        msg: The user's question.

    Returns:
        The answer text, a canned HTML link/blurb, or a refusal sentence.
    """
    unknown = "I don't know based on the resume."
    lower_msg = msg.lower().strip()

    # 1. Canned answers, matched by substring.
    for key, val in STATIC_ANSWERS.items():
        if key in lower_msg:
            return val

    # 2. Topic gate: require at least one known résumé keyword.
    if not (SAFE & set(re.findall(r"\w+", lower_msg))):
        return "Please ask something related to my résumé."

    # 3. Retrieval. Very short queries embed less specifically, so they get a
    #    lower similarity bar.
    sims, ctxs = retrieve(msg)
    min_sim = 0.10 if len(msg.split()) < 3 else 0.25
    # Guard the empty case first: max() of an empty sequence raises.
    if not ctxs or max(sims) < min_sim:
        return unknown

    # 4. Generation constrained to the retrieved context.
    ctx = "\n".join(ctxs)
    reply = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"Context:\n{ctx}"},
            {"role": "user", "content": f"Question: {msg}"},
        ],
        max_tokens=256,
        temperature=0.2,
    ).choices[0].message.content
    # The API may return no text content; treat that as an empty answer
    # rather than failing on None.strip().
    ans = (reply or "").strip()

    # Reject answers that share no word at all with the context.
    return ans if overlap(ans, ctx) else unknown
|
|
|
|
|
# Labels of the quick-question buttons; each label is also the question sent.
quick = [
    "Professional Summary",
    "Education details",
    "Experience",
    "Certifications",
    "Skills",
    "LinkedIn",
    "Blog",
    "Architecture",
]
|
|
|
def set_quick_q(q):
    """Return *q* unchanged (identity pass-through)."""
    selected = q
    return selected
|
|
|
def _append_turn(hist, question):
    """Return *hist* extended with *question* and the bot's answer to it."""
    return hist + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": generate(question)},
    ]


with gr.Blocks(theme="soft") as demo:
    with gr.Row(equal_height=True):
        # Left column: one button per predefined question.
        with gr.Column(min_width=170, scale=0):
            gr.Markdown("### Quick questions")
            btns = [gr.Button(label) for label in quick]

        # Right column: conversation view and free-text input.
        with gr.Column(scale=4):
            chat = gr.Chatbot(type="messages", label="Résumé Bot", height=520)
            inp = gr.Textbox(placeholder="Ask about my résumé…", show_label=False)
            state = gr.State([])
            # Hidden holder for the question of the quick button just clicked.
            quick_q = gr.Textbox(visible=False)

    def user_submit(msg, hist):
        """Handle a typed question: clear the box and show the new turn."""
        # Ignore blank submissions instead of adding an empty chat turn.
        if not msg or not msg.strip():
            return "", hist, hist
        hist = _append_turn(hist, msg)
        return "", hist, hist

    inp.submit(user_submit, [inp, state], [inp, chat, state])

    def quick_send(hist, q):
        """Handle a quick-button question stored in the hidden textbox."""
        hist = _append_turn(hist, q)
        return hist, hist

    for b, label in zip(btns, quick):
        # Bind the label as a default argument so each button keeps its own
        # question (a plain closure would see only the last loop value).
        b.click(
            fn=lambda val=label: val,
            inputs=None,
            outputs=quick_q,
        ).then(
            quick_send, [state, quick_q], [chat, state]
        )


if __name__ == "__main__":
    # share=True asks Gradio for a publicly reachable link in addition to the
    # local server.
    demo.launch(share=True)
|
|