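"""Streamlit app that matches academic majors to related job postings.

Pick a School, Department, and Major; the app embeds the major's description,
retrieves similar postings from a FAISS index, reranks them with a
cross-encoder, clusters titles into roles, and offers visualizations and
paginated browsing of the results.
"""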
import streamlit as st
import sqlite3
import pandas as pd
import gdown
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
from wordcloud import WordCloud
import altair as alt
import plotly.express as px
from sklearn.cluster import KMeans
# Path Constants
MAJOR_DB_PATH = "./docs/majors.db"
MAP_DB_PATH = "./docs/map.db"
JOBS_DB_PATH = "./docs/jobs.db"
FAISS_INDEX_PATH = "./docs/jobs_embeddings.faiss"
# Secrets
JOBS_GDRIVE_URL = os.environ.get("JOB_URL")
FAISS_GDRIVE_URL = os.environ.get("FAISS_URL")
# Model Constants
EMBEDDINGS_MODEL = "mixedbread-ai/mxbai-embed-xsmall-v1"
RERANK_MODEL = "mixedbread-ai/mxbai-rerank-xsmall-v1"
EMBEDDING_DIM = 384  # embeddings are truncated to this dimensionality
# Search Constants
MAX_JOB_POSTINGS_FETCH = 100
SEMANTIC_SCORE_SCALE = 100.0
RELEVANCY_THRESHOLD = 0.1  # currently unused; filtering applies a 25th-percentile cutoff instead
# Ensure job DB exists locally
def download_jobs_db():
    if not os.path.exists(JOBS_DB_PATH):
        st.info("Downloading job postings database...")
        gdown.download(JOBS_GDRIVE_URL, JOBS_DB_PATH, quiet=False)
# Ensure FAISS index exists locally
def download_faiss_index():
    if not os.path.exists(FAISS_INDEX_PATH):
        st.info("Downloading FAISS index...")
        gdown.download(FAISS_GDRIVE_URL, FAISS_INDEX_PATH, quiet=False)
# Load hierarchical structure from majors.db
@st.cache_data
def load_major_hierarchy():
    conn = sqlite3.connect(MAJOR_DB_PATH)
    df = pd.read_sql(
        "SELECT DISTINCT School, Department, [Major Name] AS Major, [Degree Level] AS DegreeLevel FROM majors;",
        conn,
    )
    conn.close()
    return df
# Load embedding model
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer(EMBEDDINGS_MODEL, truncate_dim=EMBEDDING_DIM)
# Load FAISS index (vector ids correspond to job_ids)
@st.cache_resource
def load_faiss_index():
    download_faiss_index()
    return faiss.read_index(FAISS_INDEX_PATH)
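# Load the cross-encoder rerank model, cached so Streamlit reruns reuse
# one instance rather than re-instantiating it on every script execution
@st.cache_resource
def load_rerank_model():
    return CrossEncoder(RERANK_MODEL)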
# Generate embedding for a major
@st.cache_data
def get_major_embedding(major_display: str):
    """
    major_display is of the form "Major Name (DegreeLevel)".
    Encode the full query text (name, degree, and description) for FAISS search.
    """
    model = load_embedding_model()
    full_text = get_major_query_text(major_display)
    emb = model.encode(full_text, prompt_name="query", convert_to_numpy=True)
    emb = np.array(emb, dtype="float32")
    # Normalize in place so inner-product search over the index matches cosine similarity
    faiss.normalize_L2(emb.reshape(1, -1))
    return emb
# Build the query text for a major: "Major Name (DegreeLevel). description"
@st.cache_data
def get_major_query_text(major_display: str) -> str:
    # Parse the display string into major name and degree level
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]  # drop trailing ")"
    else:
        name, degree = major_display, ""
    # Fetch the rich description from majors.db
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree),
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""
    return f"{name} ({degree}). {desc}"
# Perform semantic search using FAISS
@st.cache_data
def perform_semantic_search(major_embedding, _faiss_index, k_results):
    # D holds similarity scores, I holds vector ids (these ids are the job_ids)
    D, I = _faiss_index.search(major_embedding.reshape(1, -1), k_results)
    results = []
    for idx64, score in zip(I[0], D[0]):
        if idx64 == -1:  # FAISS pads missing neighbors with -1
            continue
        results.append({'job_id': int(idx64), 'semantic_score': float(score)})
    return pd.DataFrame(results)
# Fetch jobs by ID and calculate relevancy in Python
def get_jobs_with_semantic_scores(job_ids_with_scores):
    if job_ids_with_scores.empty:
        return pd.DataFrame()
    job_ids = job_ids_with_scores['job_id'].tolist()
    placeholders = ','.join('?' for _ in job_ids)
    conn = sqlite3.connect(JOBS_DB_PATH)
    sql = f"SELECT * FROM job_postings WHERE job_id IN ({placeholders});"
    jobs_df = pd.read_sql(sql, conn, params=job_ids)
    conn.close()
    # Merge with semantic scores and scale into a relevancy score
    merged_df = pd.merge(jobs_df, job_ids_with_scores, on='job_id', how='inner')
    merged_df['relevancy_score'] = merged_df['semantic_score'] * SEMANTIC_SCORE_SCALE
    # Filter at the 25th percentile of relevancy scores; with fewer than 4 rows
    # the percentile is not meaningful, so fall back to a very low threshold
    if len(merged_df) >= 4:
        percentile_threshold = np.percentile(merged_df['relevancy_score'], 25)
    else:
        percentile_threshold = 0.0
    filtered_df = merged_df[merged_df['relevancy_score'] >= percentile_threshold]
    # Sort by relevancy score and cap at MAX_JOB_POSTINGS_FETCH
    sorted_df = filtered_df.sort_values(by='relevancy_score', ascending=False)
    return sorted_df.head(MAX_JOB_POSTINGS_FETCH)
# Run query on jobs.db
def query_jobs(sql_query, params):
    conn = sqlite3.connect(JOBS_DB_PATH)
    df = pd.read_sql(sql_query, conn, params=params)
    conn.close()
    return df
# Streamlit UI
st.set_page_config(
    page_title="Major-to-Job Explorer",
    layout="centered",
    initial_sidebar_state="expanded"
)
st.title("🎓 Major-to-Job Postings Explorer")
# Download job DB and FAISS index if needed
download_jobs_db()
download_faiss_index()
# Load hierarchy
hierarchy_df = load_major_hierarchy()
# Step 1: Select School
schools = sorted(hierarchy_df["School"].unique())
selected_school = st.selectbox("Select a School:", schools)
if selected_school:
    departments = sorted(hierarchy_df[hierarchy_df["School"] == selected_school]["Department"].unique())
    selected_department = st.selectbox("Select a Department:", departments)
    if selected_department:
        majors_df = hierarchy_df[
            (hierarchy_df["School"] == selected_school) &
            (hierarchy_df["Department"] == selected_department)
        ].copy()
        # Create a display string for the selectbox
        majors_df["Display"] = majors_df["Major"] + " (" + majors_df["DegreeLevel"] + ")"
        # Create a mapping from display string to actual major name
        major_display_to_name = dict(zip(majors_df["Display"], majors_df["Major"]))
        # Sort the display names
        display_majors = sorted(majors_df["Display"].unique())
        selected_major_display = st.selectbox("Select a Major:", display_majors)
        # Get the actual major name from the display name
        selected_major = major_display_to_name.get(selected_major_display)
        search_button = st.button("Search Jobs")
        if search_button and selected_major:
            # Reset pagination and remember the selection when a new search starts
            st.session_state.current_page = 0
            st.session_state.last_selected_major = selected_major
            st.session_state.last_selected_major_display = selected_major_display
            with st.spinner("Loading semantic data..."):
                faiss_index = load_faiss_index()
            with st.spinner(f"Generating embedding for {selected_major}..."):
                # use the "Major Name (DegreeLevel)" string the user selected
                major_embedding = get_major_embedding(selected_major_display)
            with st.spinner("Performing semantic search..."):
                # Fetch extra results up front so percentile filtering still leaves enough
                semantic_results = perform_semantic_search(major_embedding, faiss_index, MAX_JOB_POSTINGS_FETCH * 4)
                st.session_state.search_results = get_jobs_with_semantic_scores(semantic_results)
            if not st.session_state.search_results.empty:
                st.success(f"Search complete! Found {len(st.session_state.search_results)} relevant job postings.")
            else:
                st.warning("No relevant job postings found for this major.")
        # Display results if they exist in session state
        if 'search_results' in st.session_state and not st.session_state.search_results.empty:
            results = st.session_state.search_results
            current_major_display = st.session_state.get('last_selected_major', 'Selected Major')
            # ── Cross-Encoder rerank using the major description ──
            cross_encoder = load_rerank_model()
            # 1) Rebuild the exact query text used for the FAISS search
            query_text = get_major_query_text(
                st.session_state.get('last_selected_major_display', selected_major_display)
            )
            # 2) Build (query_text, job_description) pairs
            pairs = [(query_text, jd) for jd in results["description"].tolist()]
            # 3) Score each pair and sort by cross-encoder relevance
            cross_scores = cross_encoder.predict(pairs)
            results["cross_score"] = cross_scores
            results = results.sort_values("cross_score", ascending=False).reset_index(drop=True)
            # 4) Truncate to top-N for display
            TOP_N = st.sidebar.slider("Results to show", 5, 100, 50)
            results = results.head(TOP_N).copy()
            # ── Dynamic Role Clustering ──
            # 1) Encode job titles into embeddings
            model = load_embedding_model()
            titles = results["title"].tolist()
            embs = model.encode(titles, convert_to_numpy=True)
            # 2) Cluster into up to 8 roles
            n_roles = min(8, len(titles))
            kmeans = KMeans(n_clusters=n_roles, random_state=0).fit(embs)
            results["cluster_id"] = kmeans.labels_
            # 3) Name each cluster after the title closest to its centroid
            centroids = kmeans.cluster_centers_
            role_names = []
            for cid, center in enumerate(centroids):
                idxs = np.where(results["cluster_id"] == cid)[0]  # positional indices
                cluster_embs = embs[idxs]
                # positional index of the embedding nearest the centroid
                winner_pos = idxs[np.argmin(np.linalg.norm(cluster_embs - center, axis=1))]
                # use iloc to fetch by positional index
                role_names.append(results.iloc[winner_pos]["title"])
            # 4) Map cluster ids into a new role_name column
            cluster_to_role = {i: name for i, name in enumerate(role_names)}
            results["role_name"] = results["cluster_id"].map(cluster_to_role)
            # ---------- Beginning of "Visualization" section ----------
            viz = st.sidebar.selectbox(
                "Choose a visualization",
                ["None", "Word Cloud", "Top-10 Bar Chart", "Treemap"],
                index=2
            )
            if viz == "None":
                st.info("No visualization selected. Use the sidebar to choose one.")
            else:
                st.header("🔍 At-a-Glance: Top Job Roles")
                if viz == "Word Cloud":
                    # Sum relevancy by role
                    role_weights = (
                        results
                        .groupby("role_name")["relevancy_score"]
                        .sum()
                        .to_dict()
                    )
                    # Generate the cloud from integer frequencies
                    wc = WordCloud(
                        width=800, height=400,
                        background_color="white",
                        max_words=50
                    ).generate_from_frequencies({r: int(s * 100) for r, s in role_weights.items()})
                    st.subheader("Role-Level Word Cloud")
                    st.image(wc.to_array(), use_container_width=True)
                elif viz == "Top-10 Bar Chart":
                    # Let the user pick the ranking metric
                    metric = st.sidebar.radio("Rank by:", ["Count", "Avg Relevancy"])
                    field = "count" if metric == "Count" else "avg_rel"
                    # Aggregate on role_name
                    df_role = (
                        results
                        .groupby("role_name")
                        .agg(count=("role_name", "size"), avg_rel=("relevancy_score", "mean"))
                        .reset_index()
                        .sort_values(field, ascending=False)
                        .head(10)
                    )
                    chart = (
                        alt.Chart(df_role)
                        .mark_bar()
                        .encode(
                            x=alt.X(f"{field}:Q", title=metric),
                            y=alt.Y("role_name:N", sort='-x', title="Role"),
                            tooltip=["role_name", "count", "avg_rel"]
                        )
                        .properties(title=f"Top-10 Roles by {metric}", height=400)
                    )
                    st.altair_chart(chart, use_container_width=True)
                    with st.expander("View Data Table"):
                        st.table(
                            df_role.rename(columns={
                                "role_name": "Role", "count": "Count", "avg_rel": "Avg. Relevancy"
                            })
                        )
                elif viz == "Treemap":
                    # 1) Prepare a two-level DataFrame
                    df_tree = (
                        results
                        .groupby(["role_name", "title"])
                        .agg(
                            count=("title", "size"),
                            avg_rel=("relevancy_score", "mean")
                        )
                        .reset_index()
                    )

                    # 2) Prune children: keep top 5 titles per role, aggregate the rest
                    def prune_children(df, top_n=5):
                        pieces = []
                        for role, grp in df.groupby("role_name"):
                            # pick the top N titles by count
                            top = grp.nlargest(top_n, "count")
                            rest = grp.drop(top.index)
                            pieces.append(top)
                            if not rest.empty:
                                pieces.append(pd.DataFrame({
                                    "role_name": [role],
                                    "title": ["Other Titles"],
                                    "count": [rest["count"].sum()],
                                    "avg_rel": [rest["avg_rel"].mean()]
                                }))
                        return pd.concat(pieces, ignore_index=True)

                    # apply pruning
                    df_tree = prune_children(df_tree, top_n=5)
                    # 3) Build a treemap showing both levels at once
                    fig = px.treemap(
                        df_tree,
                        path=["role_name", "title"],  # level 0 = role_name, level 1 = title
                        values="count",
                        color="avg_rel",
                        color_continuous_scale="Viridis",
                        hover_data=["count", "avg_rel"],
                        title="Jobs Treemap (Roles → Titles)",
                        maxdepth=2  # always draw both levels
                    )
                    # 4) Improve padding and fonts for clarity
                    fig.update_traces(
                        tiling=dict(pad=3),                            # inner padding
                        outsidetextfont=dict(size=18, color="white"),  # role labels
                        insidetextfont=dict(size=12, color="white"),   # title labels
                        textinfo="label+value"                         # show name + count on each rectangle
                    )
                    # 5) Add breathing room and a clear colorbar title
                    fig.update_layout(
                        margin=dict(t=50, l=25, r=25, b=25),
                        coloraxis_colorbar=dict(title="Avg. Relevancy")
                    )
                    st.plotly_chart(fig, use_container_width=True)
            # ---------- End of "Visualization" section ----------
st.subheader(f"Job Postings for: {current_major_display}")
st.write("Results are ranked by semantic relevancy.")
# Pagination setup
JOBS_PER_PAGE = 10
if 'current_page' not in st.session_state:
st.session_state.current_page = 0
total_jobs = len(results)
total_pages = (total_jobs + JOBS_PER_PAGE - 1) // JOBS_PER_PAGE
start_index = st.session_state.current_page * JOBS_PER_PAGE
end_index = min(start_index + JOBS_PER_PAGE, total_jobs)
results_page = results.iloc[start_index:end_index]
            # Navigation above the listings
            nav_cols = st.columns([1, 1, 1], vertical_alignment='center', gap='large', border=True)
            with nav_cols[0]:
                if st.session_state.current_page > 0:
                    if st.button("Previous", key="prev_top"):
                        st.session_state.current_page -= 1
                        st.rerun()
                    if st.button("First Page", key="first_top"):
                        st.session_state.current_page = 0
                        st.rerun()
            with nav_cols[1]:
                # Page number selector
                page_options = [i + 1 for i in range(total_pages)]
                selected_page_display = st.selectbox(
                    "Go to Page:",
                    options=page_options,
                    index=st.session_state.current_page,
                    key="page_selector_top"
                )
                # Jump when the selection changes
                if selected_page_display - 1 != st.session_state.current_page:
                    st.session_state.current_page = selected_page_display - 1
                    st.rerun()
            with nav_cols[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Next", key="next_top"):
                        st.session_state.current_page += 1
                        st.rerun()
                    if st.button("Last Page", key="last_top"):
                        st.session_state.current_page = total_pages - 1
                        st.rerun()
            st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")
            if results_page.empty:
                st.info("No job postings found for this page.")
            else:
                for _, row in results_page.iterrows():
                    st.subheader(f"{row['title']} at {row['company_name']}")
                    st.write(
                        f"**Location:** {row['location']} | "
                        f"**Experience Level:** {row['formatted_experience_level']} | "
                        f"**Relevancy Score:** {row['relevancy_score']:.2f}"
                    )
                    with st.expander("View Details"):
                        st.write("**Description:**")
                        st.markdown(row['description'])
                        if pd.notna(row['skills_desc']) and row['skills_desc']:
                            st.write("**Skills:**")
                            st.markdown(row['skills_desc'])
                        st.write(f"**Listed Time:** {row['listed_time']}")
                        st.write(f"**Work Type:** {row['formatted_work_type']}")
                        st.write(f"**Remote Allowed:** {'Yes' if row['remote_allowed'] else 'No'}")
                        salary_info = []
                        if pd.notna(row['min_salary']) and pd.notna(row['max_salary']):
                            salary_info.append(f"{row['currency']} {row['min_salary']:.2f} - {row['max_salary']:.2f} {row['pay_period']}")
                        elif pd.notna(row['normalized_salary']):
                            salary_info.append(f"Normalized Salary: {row['currency']} {row['normalized_salary']:.2f}")
                        if salary_info:
                            st.write(f"**Salary:** {', '.join(salary_info)}")
                        else:
                            st.write("**Salary:** Not specified")
                        if pd.notna(row['job_posting_url']) and row['job_posting_url']:
                            st.markdown(f"**Job Posting URL:** [Link]({row['job_posting_url']})")
                        if pd.notna(row['application_url']) and row['application_url']:
                            st.markdown(f"**Application URL:** [Link]({row['application_url']})")
                        st.write(f"**Views:** {row['views']} | **Applies:** {row['applies']}")
                    st.markdown("---")
            # Navigation below the listings
            nav_cols_bottom = st.columns([1, 1, 1], vertical_alignment='center', gap='large', border=True)
            with nav_cols_bottom[0]:
                if st.session_state.current_page > 0:
                    if st.button("Previous", key="prev_bottom"):
                        st.session_state.current_page -= 1
                        st.rerun()
                    if st.button("First Page", key="first_bottom"):
                        st.session_state.current_page = 0
                        st.rerun()
            with nav_cols_bottom[1]:
                page_options = [i + 1 for i in range(total_pages)]
                selected_page_display_bottom = st.selectbox(
                    "Go to Page:",
                    options=page_options,
                    index=st.session_state.current_page,
                    key="page_selector_bottom"
                )
                if selected_page_display_bottom - 1 != st.session_state.current_page:
                    st.session_state.current_page = selected_page_display_bottom - 1
                    st.rerun()
            with nav_cols_bottom[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Next", key="next_bottom"):
                        st.session_state.current_page += 1
                        st.rerun()
                    if st.button("Last Page", key="last_bottom"):
                        st.session_state.current_page = total_pages - 1
                        st.rerun()
            st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")
            if not results.empty:
                st.download_button(
                    "Download Results as CSV",
                    data=results.to_csv(index=False),
                    file_name="job_results.csv",
                    mime="text/csv"
                )