import streamlit as st
import sqlite3
import pandas as pd
import gdown
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
from wordcloud import WordCloud
import altair as alt
import plotly.express as px
from sklearn.cluster import KMeans
# Path Constants
MAJOR_DB_PATH = "./docs/majors.db"
MAP_DB_PATH = "./docs/map.db"
JOBS_DB_PATH = "./docs/jobs.db"
FAISS_INDEX_PATH = "./docs/jobs_embeddings.faiss"

# Secrets
JOBS_GDRIVE_URL = os.environ.get("JOB_URL")
FAISS_GDRIVE_URL = os.environ.get("FAISS_URL")

# Model Constants
EMBEDDINGS_MODEL = "mixedbread-ai/mxbai-embed-xsmall-v1"
RERANK_MODEL = "mixedbread-ai/mxbai-rerank-xsmall-v1"
EMBEDDING_DIM = 384  # matches the model's 384-dimensional output

# Search Constants
MAX_JOB_POSTINGS_FETCH = 100
SEMANTIC_SCORE_SCALE = 100.0
RELEVANCY_THRESHOLD = 0.1
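
# NOTE: RELEVANCY_THRESHOLD is defined but never referenced below; the actual
# filtering in get_jobs_with_semantic_scores() uses a dynamic 25th-percentile
# cutoff instead. Presuming the index stores L2-normalized vectors searched by
# inner product, semantic_score is a cosine similarity, so relevancy_score
# lands roughly in [-100, 100] after scaling.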
# Ensure the jobs DB exists locally
def download_jobs_db():
    if not os.path.exists(JOBS_DB_PATH):
        st.info("Downloading job postings database...")
        gdown.download(JOBS_GDRIVE_URL, JOBS_DB_PATH, quiet=False)

# Ensure the FAISS index exists locally
def download_faiss_index():
    if not os.path.exists(FAISS_INDEX_PATH):
        st.info("Downloading FAISS index...")
        gdown.download(FAISS_GDRIVE_URL, FAISS_INDEX_PATH, quiet=False)
# Load hierarchical structure from majors.db
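# Assumption: this loader was meant to be cached; st.cache_data keeps the
# small hierarchy DataFrame in memory instead of re-reading the DB each rerun.
@st.cache_data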
def load_major_hierarchy():
    conn = sqlite3.connect(MAJOR_DB_PATH)
    df = pd.read_sql(
        "SELECT DISTINCT School, Department, [Major Name] AS Major, [Degree Level] AS DegreeLevel FROM majors;",
        conn,
    )
    conn.close()
    return df
# Load embedding model
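# Assumption: the SentenceTransformer was meant to persist across reruns;
# st.cache_resource avoids reloading the weights on every interaction.
@st.cache_resource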
def load_embedding_model():
    return SentenceTransformer(EMBEDDINGS_MODEL, truncate_dim=EMBEDDING_DIM)
# Load the FAISS index (vector IDs double as job_ids; see perform_semantic_search)
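# Assumption: likewise cached; faiss.read_index is too slow to repeat per rerun.
@st.cache_resource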
def load_faiss_index():
    download_faiss_index()
    index = faiss.read_index(FAISS_INDEX_PATH)
    return index
# Generate an embedding for a major
def get_major_embedding(major_display: str):
    """
    major_display has the form "Major Name (DegreeLevel)".
    Parse out both pieces, look up the description, and encode all three together.
    """
    model = load_embedding_model()
    # 1) Parse the display string into name and degree
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]  # drop the trailing ")"
    else:
        name, degree = major_display, ""
    # 2) Fetch the rich description from majors.db
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree),
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""
    # 3) Build the full query text
    full_text = f"{name} ({degree}). {desc}"
    # 4) Embed and L2-normalize in place, so inner-product search equals cosine similarity
    emb = model.encode(full_text, prompt_name="query", convert_to_numpy=True)
    emb = np.array(emb, dtype="float32")
    faiss.normalize_L2(emb.reshape(1, -1))
    return emb
def get_major_query_text(major_display: str) -> str:
    # Parse out name and degree exactly as get_major_embedding does
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]
    else:
        name, degree = major_display, ""
    # Fetch the same description
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree),
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""
    # Rebuild the exact query text used for the FAISS search
    return f"{name} ({degree}). {desc}"
# Perform semantic search using FAISS
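# Assumption: this was cached with st.cache_data; the leading underscore on
# `_faiss_index` is Streamlit's convention for excluding an argument from hashing.
@st.cache_data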
def perform_semantic_search(major_embedding, _faiss_index, k_results):
    D, I = _faiss_index.search(major_embedding.reshape(1, -1), k_results)
    results = []
    for idx64, score in zip(I[0], D[0]):
        if idx64 == -1:  # FAISS pads missing results with -1
            continue
        # Index IDs are the job_ids themselves (presumably assigned via add_with_ids)
        job_id = int(idx64)
        results.append({"job_id": job_id, "semantic_score": float(score)})
    return pd.DataFrame(results)
# Fetch jobs by ID and calculate relevancy in Python
def get_jobs_with_semantic_scores(job_ids_with_scores):
    if job_ids_with_scores.empty:
        return pd.DataFrame()
    job_ids = job_ids_with_scores["job_id"].tolist()
    placeholders = ",".join(["?" for _ in job_ids])
    conn = sqlite3.connect(JOBS_DB_PATH)
    sql = f"SELECT * FROM job_postings WHERE job_id IN ({placeholders});"
    jobs_df = pd.read_sql(sql, conn, params=job_ids)
    conn.close()
    # Merge with semantic scores
    merged_df = pd.merge(jobs_df, job_ids_with_scores, on="job_id", how="inner")
    # Scale similarity into a relevancy score
    merged_df["relevancy_score"] = merged_df["semantic_score"] * SEMANTIC_SCORE_SCALE
    # Compute the 25th percentile of relevancy scores; with fewer than 4 rows
    # a percentile is not meaningful, so fall back to a very low threshold
    if not merged_df.empty and len(merged_df) >= 4:
        percentile_threshold = np.percentile(merged_df["relevancy_score"], 25)
    else:
        percentile_threshold = 0.0
    # Filter by the dynamic percentile threshold
    filtered_df = merged_df[merged_df["relevancy_score"] >= percentile_threshold]
    # Sort by relevancy score and cap at MAX_JOB_POSTINGS_FETCH
    sorted_df = filtered_df.sort_values(by="relevancy_score", ascending=False)
    return sorted_df.head(MAX_JOB_POSTINGS_FETCH)
# Run a parameterized query on jobs.db
def query_jobs(sql_query, params):
    conn = sqlite3.connect(JOBS_DB_PATH)
    df = pd.read_sql(sql_query, conn, params=params)
    conn.close()
    return df
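
# NOTE: query_jobs is not called anywhere in this file; presumably it is kept
# as a general-purpose helper for ad-hoc queries.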
# Streamlit UI
st.set_page_config(
    page_title="Major-to-Job Explorer",
    layout="centered",
    initial_sidebar_state="expanded",
)
st.title("🎓 Major-to-Job Postings Explorer")

# Download the job DB and FAISS index if needed
download_jobs_db()
download_faiss_index()

# Load hierarchy
hierarchy_df = load_major_hierarchy()

# Step 1: Select School
schools = sorted(hierarchy_df["School"].unique())
selected_school = st.selectbox("Select a School:", schools)
if selected_school:
    departments = sorted(hierarchy_df[hierarchy_df["School"] == selected_school]["Department"].unique())
    selected_department = st.selectbox("Select a Department:", departments)
    if selected_department:
        majors_df = hierarchy_df[
            (hierarchy_df["School"] == selected_school) &
            (hierarchy_df["Department"] == selected_department)
        ].copy()
        # Build a display string for the selectbox
        majors_df["Display"] = majors_df["Major"] + " (" + majors_df["DegreeLevel"] + ")"
        # Map each display string back to the underlying major name
        major_display_to_name = dict(zip(majors_df["Display"], majors_df["Major"]))
        # Sort the display names
        display_majors = sorted(majors_df["Display"].unique())
        selected_major_display = st.selectbox("Select a Major:", display_majors)
        # Recover the actual major name from the display name
        selected_major = major_display_to_name.get(selected_major_display)
        search_button = st.button("Search Jobs")
        if search_button and selected_major:
            # Reset pagination when a new search is initiated
            st.session_state.current_page = 0
            st.session_state.last_selected_major = selected_major
            with st.spinner("Loading semantic data..."):
                faiss_index = load_faiss_index()
            with st.spinner(f"Generating embedding for {selected_major}..."):
                # Use the "Major Name (DegreeLevel)" string the user selected
                major_embedding = get_major_embedding(selected_major_display)
            with st.spinner("Performing semantic search..."):
                # Fetch extra results up front to leave room for percentile filtering
                semantic_results = perform_semantic_search(major_embedding, faiss_index, MAX_JOB_POSTINGS_FETCH * 4)
                st.session_state.search_results = get_jobs_with_semantic_scores(semantic_results)
            if not st.session_state.search_results.empty:
                st.success(f"Search complete! Found {len(st.session_state.search_results)} relevant job postings.")
            else:
                st.warning("No relevant job postings found for this major.")
                st.session_state.search_results = pd.DataFrame()
# Display results if they exist in session state
if "search_results" in st.session_state and not st.session_state.search_results.empty:
    results = st.session_state.search_results
    # The major name stored at search time (without the degree suffix)
    current_major_display = st.session_state.get("last_selected_major", "Selected Major")

    # ── Cross-Encoder rerank using the major description ──
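    # NOTE: instantiating the CrossEncoder here reloads it on every rerun;
    # wrapping it in a cached loader (e.g. st.cache_resource) would likely be
    # cheaper, but that is an assumption about the deployment, not a change
    # made here.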
    cross_encoder = CrossEncoder(RERANK_MODEL)
    # 1) Grab the exact same query text that was used for the FAISS search.
    #    NOTE: this reads the current selectbox value, so if the user changes
    #    the major without clicking Search, the rerank query and the stored
    #    results can diverge.
    query_text = get_major_query_text(selected_major_display)
    # 2) Build (query_text, job_description) pairs
    pairs = [(query_text, jd) for jd in results["description"].tolist()]
    # 3) Score each pair with the cross-encoder and sort by that score
    cross_scores = cross_encoder.predict(pairs)
    results["cross_score"] = cross_scores
    results = results.sort_values("cross_score", ascending=False).reset_index(drop=True)
    # 4) Optionally truncate to the top N for display
    TOP_N = st.sidebar.slider("Results to show", 5, 100, 50)
    results = results.head(TOP_N).copy()

    # ── Dynamic Role Clustering ──
    # 1) Encode the job titles into embeddings
    model = load_embedding_model()
    titles = results["title"].tolist()
    embs = model.encode(titles, convert_to_numpy=True)
    # 2) Cluster into at most 8 roles
    n_roles = min(8, len(titles))
    kmeans = KMeans(n_clusters=n_roles, random_state=0).fit(embs)
    results["cluster_id"] = kmeans.labels_
    # 3) Name each cluster after the title whose embedding is closest to the centroid
    centroids = kmeans.cluster_centers_
    role_names = []
    for cid, center in enumerate(centroids):
        idxs = np.where(results["cluster_id"] == cid)[0]  # positional indices
        cluster_embs = embs[idxs]
        # Positional index of the embedding closest to the centroid
        winner_pos = idxs[np.argmin(np.linalg.norm(cluster_embs - center, axis=1))]
        # Use iloc to fetch by positional index
        role_names.append(results.iloc[winner_pos]["title"])
    # 4) Map cluster IDs to human-readable role names
    cluster_to_role = {i: name for i, name in enumerate(role_names)}
    results["role_name"] = results["cluster_id"].map(cluster_to_role)
    # ---------- Beginning of "Visualization" section ----------
    viz = st.sidebar.selectbox(
        "Choose a visualization",
        ["None", "Word Cloud", "Top-10 Bar Chart", "Treemap"],
        index=2,
    )
    if viz == "None":
        st.info("No visualization selected. Use the sidebar to choose one.")
    else:
        st.header("🔍 At-a-Glance: Top Job Roles")
        if viz == "Word Cloud":
            # Sum relevancy by role
            role_weights = (
                results
                .groupby("role_name")["relevancy_score"]
                .sum()
                .to_dict()
            )
            # Generate the cloud; frequencies are scaled to integers
            wc = WordCloud(
                width=800, height=400,
                background_color="white",
                max_words=50,
            ).generate_from_frequencies({r: int(s * 100) for r, s in role_weights.items()})
            st.subheader("Role-Level Word Cloud")
            st.image(wc.to_array(), use_container_width=True)
        elif viz == "Top-10 Bar Chart":
            # Let the user pick a ranking metric
            metric = st.sidebar.radio("Rank by:", ["Count", "Avg Relevancy"])
            field = "count" if metric == "Count" else "avg_rel"
            # Aggregate on role_name
            df_role = (
                results
                .groupby("role_name")
                .agg(count=("role_name", "size"), avg_rel=("relevancy_score", "mean"))
                .reset_index()
                .sort_values(field, ascending=False)
                .head(10)
            )
            chart = (
                alt.Chart(df_role)
                .mark_bar()
                .encode(
                    x=alt.X(f"{field}:Q", title=metric),
                    y=alt.Y("role_name:N", sort="-x", title="Role"),
                    tooltip=["role_name", "count", "avg_rel"],
                )
                .properties(title=f"Top-10 Roles by {metric}", height=400)
            )
            st.altair_chart(chart, use_container_width=True)
            with st.expander("View Data Table"):
                st.table(
                    df_role.rename(columns={
                        "role_name": "Role", "count": "Count", "avg_rel": "Avg. Relevancy",
                    })
                )
        elif viz == "Treemap":
            # 1) Prepare a two-level DataFrame
            df_tree = (
                results
                .groupby(["role_name", "title"])
                .agg(
                    count=("title", "size"),
                    avg_rel=("relevancy_score", "mean"),
                )
                .reset_index()
            )
            # 2) Prune children: keep the top 5 titles per role, aggregate the rest
            def prune_children(df, top_n=5):
                pieces = []
                for role, grp in df.groupby("role_name"):
                    # Pick the top N titles by count
                    top = grp.nlargest(top_n, "count")
                    rest = grp.drop(top.index)
                    pieces.append(top)
                    if not rest.empty:
                        pieces.append(pd.DataFrame({
                            "role_name": [role],
                            "title": ["Other Titles"],
                            "count": [rest["count"].sum()],
                            "avg_rel": [rest["avg_rel"].mean()],
                        }))
                return pd.concat(pieces, ignore_index=True)

            # Apply pruning
            df_tree = prune_children(df_tree, top_n=5)
            # 3) Build a treemap showing both levels at once
            fig = px.treemap(
                df_tree,
                path=["role_name", "title"],  # level 0 = role_name, level 1 = title
                values="count",
                color="avg_rel",
                color_continuous_scale="Viridis",
                hover_data=["count", "avg_rel"],
                title="Jobs Treemap (Roles → Titles)",
                maxdepth=2,  # always draw both levels
            )
            # 4) Improve padding and fonts for clarity
            fig.update_traces(
                tiling=dict(pad=3),                            # inner padding
                outsidetextfont=dict(size=18, color="white"),  # role labels
                insidetextfont=dict(size=12, color="white"),   # title labels
                textinfo="label+value",                        # show name + count on each rectangle
            )
            # 5) Add breathing room and a clear colorbar title
            fig.update_layout(
                margin=dict(t=50, l=25, r=25, b=25),
                coloraxis_colorbar=dict(title="Avg. Relevancy"),
            )
            st.plotly_chart(fig, use_container_width=True)
    # ---------- End of "Visualization" section ----------
st.subheader(f"Job Postings for: {current_major_display}") | |
st.write("Results are ranked by semantic relevancy.") | |
# Pagination setup | |
JOBS_PER_PAGE = 10 | |
if 'current_page' not in st.session_state: | |
st.session_state.current_page = 0 | |
total_jobs = len(results) | |
total_pages = (total_jobs + JOBS_PER_PAGE - 1) // JOBS_PER_PAGE | |
start_index = st.session_state.current_page * JOBS_PER_PAGE | |
end_index = min(start_index + JOBS_PER_PAGE, total_jobs) | |
results_page = results.iloc[start_index:end_index] | |
    # Display navigation buttons
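    # Streamlit allows re-entering the same column context, so the two
    # `with nav_cols[0]:` blocks below both render into the left column
    # (Previous above First Page); likewise for nav_cols[2].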
    nav_cols = st.columns([1, 1, 1], vertical_alignment="center", gap="large", border=True)
    with nav_cols[0]:
        if st.session_state.current_page > 0:
            if st.button("Previous"):
                st.session_state.current_page -= 1
                st.rerun()
    with nav_cols[0]:
        if st.session_state.current_page > 0:
            if st.button("First Page"):
                st.session_state.current_page = 0
                st.rerun()
    with nav_cols[1]:
        # Page number selector
        page_options = [i + 1 for i in range(total_pages)]
        selected_page_display = st.selectbox(
            "Go to Page:",
            options=page_options,
            index=st.session_state.current_page,
            key="page_selector",
        )
        # Update current_page if the selection changed
        if selected_page_display - 1 != st.session_state.current_page:
            st.session_state.current_page = selected_page_display - 1
            st.rerun()
    with nav_cols[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Next"):
                st.session_state.current_page += 1
                st.rerun()
    with nav_cols[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Last Page"):
                st.session_state.current_page = total_pages - 1
                st.rerun()
    st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")
    if results_page.empty:
        st.info("No job postings found for this page.")
    else:
        for index, row in results_page.iterrows():
            st.subheader(f"{row['title']} at {row['company_name']}")
            st.write(f"**Location:** {row['location']} | **Experience Level:** {row['formatted_experience_level']} | **Relevancy Score:** {row['relevancy_score']:.2f}")
            with st.expander("View Details"):
                st.write("**Description:**")
                st.markdown(row["description"])
                if pd.notna(row["skills_desc"]) and row["skills_desc"]:
                    st.write("**Skills:**")
                    st.markdown(row["skills_desc"])
                st.write(f"**Listed Time:** {row['listed_time']}")
                st.write(f"**Work Type:** {row['formatted_work_type']}")
                st.write(f"**Remote Allowed:** {'Yes' if row['remote_allowed'] else 'No'}")
                salary_info = []
                if pd.notna(row["min_salary"]) and pd.notna(row["max_salary"]):
                    salary_info.append(f"{row['currency']} {row['min_salary']:.2f} - {row['max_salary']:.2f} {row['pay_period']}")
                elif pd.notna(row["normalized_salary"]):
                    salary_info.append(f"Normalized Salary: {row['currency']} {row['normalized_salary']:.2f}")
                if salary_info:
                    st.write(f"**Salary:** {', '.join(salary_info)}")
                else:
                    st.write("**Salary:** Not specified")
                if pd.notna(row["job_posting_url"]) and row["job_posting_url"]:
                    st.markdown(f"**Job Posting URL:** [Link]({row['job_posting_url']})")
                if pd.notna(row["application_url"]) and row["application_url"]:
                    st.markdown(f"**Application URL:** [Link]({row['application_url']})")
                st.write(f"**Views:** {row['views']} | **Applies:** {row['applies']}")
            st.markdown("---")
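    # A second, identical navigation block; the widgets carry distinct keys
    # ("prev_top", "page_selector_top", etc.) so Streamlit does not raise
    # duplicate-widget errors. Despite the "_top" naming, it renders below
    # the listings.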
    nav_cols_top = st.columns([1, 1, 1], vertical_alignment="center", gap="large", border=True)
    with nav_cols_top[0]:
        if st.session_state.current_page > 0:
            if st.button("Previous", key="prev_top"):
                st.session_state.current_page -= 1
                st.rerun()
    with nav_cols_top[0]:
        if st.session_state.current_page > 0:
            if st.button("First Page", key="first_top"):
                st.session_state.current_page = 0
                st.rerun()
    with nav_cols_top[1]:
        page_options = [i + 1 for i in range(total_pages)]
        selected_page_display_top = st.selectbox(
            "Go to Page:",
            options=page_options,
            index=st.session_state.current_page,
            key="page_selector_top",
        )
        if selected_page_display_top - 1 != st.session_state.current_page:
            st.session_state.current_page = selected_page_display_top - 1
            st.rerun()
    with nav_cols_top[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Next", key="next_top"):
                st.session_state.current_page += 1
                st.rerun()
    with nav_cols_top[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Last Page", key="last_top"):
                st.session_state.current_page = total_pages - 1
                st.rerun()
    st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")
    if not results.empty:
        st.download_button(
            "Download Results as CSV",
            data=results.to_csv(index=False),
            file_name="job_results.csv",
            mime="text/csv",
        )