import streamlit as st
import sqlite3
import pandas as pd
import gdown
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
from wordcloud import WordCloud
import altair as alt
import plotly.express as px
from sklearn.cluster import KMeans

# Path Constants
MAJOR_DB_PATH = "./docs/majors.db"
MAP_DB_PATH = "./docs/map.db"
JOBS_DB_PATH = "./docs/jobs.db"
FAISS_INDEX_PATH = "./docs/jobs_embeddings.faiss"

# Secrets
JOBS_GDRIVE_URL = os.environ.get("JOB_URL")
FAISS_GDRIVE_URL = os.environ.get("FAISS_URL")

# Model Constants
EMBEDDINGS_MODEL = "mixedbread-ai/mxbai-embed-xsmall-v1"
RERANK_MODEL = "mixedbread-ai/mxbai-rerank-xsmall-v1"
dimensions = 384

# Search Constants
MAX_JOB_POSTINGS_FETCH = 100
SEMANTIC_SCORE_SCALE = 100.0
RELEVANCY_THRESHOLD = 0.1


# Ensure job DB exists locally
def download_jobs_db():
    if not os.path.exists(JOBS_DB_PATH):
        st.info("Downloading job postings database...")
        gdown.download(JOBS_GDRIVE_URL, JOBS_DB_PATH, quiet=False)


# Ensure FAISS index exists locally
def download_faiss_index():
    if not os.path.exists(FAISS_INDEX_PATH):
        st.info("Downloading FAISS index...")
        gdown.download(FAISS_GDRIVE_URL, FAISS_INDEX_PATH, quiet=False)


# Load hierarchical structure from majors.db
@st.cache_data
def load_major_hierarchy():
    conn = sqlite3.connect(MAJOR_DB_PATH)
    df = pd.read_sql(
        "SELECT DISTINCT School, Department, [Major Name] AS Major, "
        "[Degree Level] AS DegreeLevel FROM majors;",
        conn,
    )
    conn.close()
    return df


# Load embedding model (cached so the model is loaded once per session)
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer(EMBEDDINGS_MODEL, truncate_dim=dimensions)


# Load cross-encoder reranker (cached for the same reason)
@st.cache_resource
def load_cross_encoder():
    return CrossEncoder(RERANK_MODEL)


# Load FAISS index (downloading it first if needed)
@st.cache_resource
def load_faiss_index():
    download_faiss_index()
    index = faiss.read_index(FAISS_INDEX_PATH)
    return index


@st.cache_data
def get_major_query_text(major_display: str) -> str:
    """
    major_display is of the form "Major Name (DegreeLevel)".
    Parse out both pieces, look up the stored description, and build the
    query text shared by FAISS search and cross-encoder reranking.
    """
    # 1) Parse the display string into name & degree
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]  # drop trailing ")"
    else:
        name, degree = major_display, ""

    # 2) Fetch the rich description from majors.db
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree),
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""

    # 3) Build the full query text
    return f"{name} ({degree}). {desc}"


# Generate a normalized embedding for a major
@st.cache_data
def get_major_embedding(major_display: str):
    model = load_embedding_model()
    # Reuse the exact query text so FAISS search and reranking stay consistent
    full_text = get_major_query_text(major_display)
    emb = model.encode(full_text, prompt_name="query", convert_to_numpy=True)
    emb = np.array(emb, dtype="float32")
    # Normalize in place so inner-product search behaves like cosine similarity
    faiss.normalize_L2(emb.reshape(1, -1))
    return emb
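
# ── Offline index-build sketch (an assumption; never called by this app) ──
# perform_semantic_search() below treats each FAISS result ID as a job_id,
# which suggests the index was built offline as an IndexIDMap keyed by job_id
# over L2-normalized embeddings (so inner product equals cosine similarity).
# A minimal sketch under that assumption; `job_ids` and `job_texts` are
# hypothetical inputs, not names from this project:
def build_jobs_index_sketch(job_ids, job_texts):
    model = load_embedding_model()
    embs = model.encode(job_texts, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embs)  # normalize rows in place
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dimensions))
    index.add_with_ids(embs, np.asarray(job_ids, dtype="int64"))
    faiss.write_index(index, FAISS_INDEX_PATH)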
# Perform semantic search using FAISS
@st.cache_data
def perform_semantic_search(major_embedding, _faiss_index, k_results):
    # _faiss_index is underscore-prefixed so st.cache_data skips hashing it
    D, I = _faiss_index.search(major_embedding.reshape(1, -1), k_results)
    results = []
    for idx64, score in zip(I[0], D[0]):
        if idx64 == -1:  # FAISS pads missing results with -1
            continue
        job_id = int(idx64)  # the stored FAISS ID *is* the job_id
        results.append({"job_id": job_id, "semantic_score": float(score)})
    return pd.DataFrame(results)


# Fetch jobs by ID and calculate relevancy in Python
def get_jobs_with_semantic_scores(job_ids_with_scores):
    if job_ids_with_scores.empty:
        return pd.DataFrame()

    job_ids = job_ids_with_scores["job_id"].tolist()
    placeholders = ",".join(["?" for _ in job_ids])

    conn = sqlite3.connect(JOBS_DB_PATH)
    sql = f"SELECT * FROM job_postings WHERE job_id IN ({placeholders});"
    jobs_df = pd.read_sql(sql, conn, params=job_ids)
    conn.close()

    # Merge with semantic scores
    merged_df = pd.merge(jobs_df, job_ids_with_scores, on="job_id", how="inner")

    # Calculate relevancy score
    merged_df["relevancy_score"] = merged_df["semantic_score"] * SEMANTIC_SCORE_SCALE

    # Compute the 25th percentile of relevancy scores, but only when there
    # are at least 4 rows for it to be meaningful; otherwise fall back to a
    # threshold of 0 so nothing is filtered out
    if not merged_df.empty and len(merged_df) >= 4:
        percentile_threshold = np.percentile(merged_df["relevancy_score"], 25)
    else:
        percentile_threshold = 0.0

    # Filter by the dynamic percentile threshold, dropping the lowest quartile
    filtered_df = merged_df[merged_df["relevancy_score"] >= percentile_threshold]

    # Sort by relevancy score and limit to MAX_JOB_POSTINGS_FETCH
    sorted_df = filtered_df.sort_values(by="relevancy_score", ascending=False)
    return sorted_df.head(MAX_JOB_POSTINGS_FETCH)


# Run an arbitrary parameterized query against jobs.db
def query_jobs(sql_query, params):
    conn = sqlite3.connect(JOBS_DB_PATH)
    df = pd.read_sql(sql_query, conn, params=params)
    conn.close()
    return df
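
# ── Schema assumption: columns the rendering loop below reads from
# job_postings. This set is inferred from this file, not from a published
# schema; a quick guard such as
# `missing = EXPECTED_JOB_COLUMNS - set(jobs_df.columns)` can catch drift.
EXPECTED_JOB_COLUMNS = {
    "job_id", "title", "company_name", "location", "description",
    "formatted_experience_level", "skills_desc", "listed_time",
    "formatted_work_type", "remote_allowed", "min_salary", "max_salary",
    "currency", "pay_period", "normalized_salary", "job_posting_url",
    "application_url", "views", "applies",
}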
# Streamlit UI
st.set_page_config(
    page_title="Major-to-Job Explorer",
    layout="centered",
    initial_sidebar_state="expanded",
)

st.title("🎓 Major-to-Job Postings Explorer")

# Download job DB and FAISS index if needed
download_jobs_db()
download_faiss_index()

# Load hierarchy
hierarchy_df = load_major_hierarchy()

# Step 1: Select School
schools = sorted(hierarchy_df["School"].unique())
selected_school = st.selectbox("Select a School:", schools)

if selected_school:
    departments = sorted(
        hierarchy_df[hierarchy_df["School"] == selected_school]["Department"].unique()
    )
    selected_department = st.selectbox("Select a Department:", departments)

    if selected_department:
        majors_df = hierarchy_df[
            (hierarchy_df["School"] == selected_school)
            & (hierarchy_df["Department"] == selected_department)
        ].copy()

        # Create a display string for the selectbox
        majors_df["Display"] = majors_df["Major"] + " (" + majors_df["DegreeLevel"] + ")"

        # Map each display string back to the actual major name
        major_display_to_name = dict(zip(majors_df["Display"], majors_df["Major"]))

        # Sort the display names
        display_majors = sorted(majors_df["Display"].unique())
        selected_major_display = st.selectbox("Select a Major:", display_majors)

        # Get the actual major name from the display name
        selected_major = major_display_to_name.get(selected_major_display)

        search_button = st.button("Search Jobs")

        if search_button and selected_major:
            # Reset pagination when a new search is initiated
            st.session_state.current_page = 0
            st.session_state.last_selected_major = selected_major

            with st.spinner("Loading semantic data..."):
                faiss_index = load_faiss_index()

            with st.spinner(f"Generating embedding for {selected_major}..."):
                # use the "Major Name (DegreeLevel)" string the user selected
                major_embedding = get_major_embedding(selected_major_display)

            with st.spinner("Performing semantic search..."):
                # Fetch more results than needed so percentile filtering
                # still leaves enough postings
                semantic_results = perform_semantic_search(
                    major_embedding, faiss_index, MAX_JOB_POSTINGS_FETCH * 4
                )
                st.session_state.search_results = get_jobs_with_semantic_scores(
                    semantic_results
                )

            if not st.session_state.search_results.empty:
                st.success(
                    f"Search complete! Found {len(st.session_state.search_results)} "
                    "relevant job postings."
                )
            else:
                st.warning("No relevant job postings found for this major.")
                st.session_state.search_results = pd.DataFrame()

# Display results if they exist in session state
if "search_results" in st.session_state and not st.session_state.search_results.empty:
    results = st.session_state.search_results.copy()  # avoid mutating session state
    current_major_display = st.session_state.get("last_selected_major", "Selected Major")

    # ── Cross-encoder rerank using the major description ──
    cross_encoder = load_cross_encoder()

    # 1) Grab the exact same query text used for FAISS
    query_text = get_major_query_text(selected_major_display)

    # 2) Build (query_text, job_description) pairs
    pairs = [(query_text, jd) for jd in results["description"].tolist()]

    # 3) Score each pair with the cross-encoder
    cross_scores = cross_encoder.predict(pairs)
    results["cross_score"] = cross_scores
    results = results.sort_values("cross_score", ascending=False).reset_index(drop=True)

    # 4) (Optional) Truncate to top-N for display
    TOP_N = st.sidebar.slider("Results to show", 5, 100, 50)
    results = results.head(TOP_N).copy()

    # ── Dynamic Role Clustering ──
    # 1) Encode job titles into embeddings
    model = load_embedding_model()
    titles = results["title"].tolist()
    embs = model.encode(titles, convert_to_numpy=True)

    # 2) Cluster into up to 8 roles
    n_roles = min(8, len(titles))
    kmeans = KMeans(n_clusters=n_roles, random_state=0).fit(embs)
    results["cluster_id"] = kmeans.labels_

    # 3) Name each cluster after the title closest to its centroid
    centroids = kmeans.cluster_centers_
    role_names = []
    for cid, center in enumerate(centroids):
        idxs = np.where(results["cluster_id"] == cid)[0]  # positional indices
        cluster_embs = embs[idxs]
        # positional index of the embedding closest to the centroid
        winner_pos = idxs[np.argmin(np.linalg.norm(cluster_embs - center, axis=1))]
        # iloc fetches by position, which matches embs ordering after reset_index
        role_names.append(results.iloc[winner_pos]["title"])

    # 4) Map cluster IDs onto a readable role_name column
    cluster_to_role = {i: name for i, name in enumerate(role_names)}
    results["role_name"] = results["cluster_id"].map(cluster_to_role)
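
    # ── Optional: data-driven cluster count (hedged sketch, not wired in) ──
    # The cap of 8 roles above is a heuristic. One alternative would be to
    # choose k by silhouette score before the KMeans fit above, e.g. (assumes
    # at least 4 titles, since silhouette_score needs 2 <= k <= n - 1):
    #
    #   from sklearn.metrics import silhouette_score
    #   scores = {
    #       k: silhouette_score(
    #           embs, KMeans(n_clusters=k, random_state=0).fit_predict(embs)
    #       )
    #       for k in range(2, min(8, len(embs) - 1) + 1)
    #   }
    #   n_roles = max(scores, key=scores.get)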
    # ----------Beginning of "Visualization" section-----------------
    viz = st.sidebar.selectbox(
        "Choose a visualization",
        ["None", "Word Cloud", "Top-10 Bar Chart", "Treemap"],
        index=2,
    )

    if viz == "None":
        st.info("No visualization selected. Use the sidebar to choose one.")
    else:
        st.header("🔍 At-a-Glance: Top Job Roles")

        if viz == "Word Cloud":
            # Sum relevancy by role
            role_weights = (
                results.groupby("role_name")["relevancy_score"].sum().to_dict()
            )

            # Generate the cloud, scaling scores to integer frequencies
            wc = WordCloud(
                width=800, height=400, background_color="white", max_words=50
            ).generate_from_frequencies(
                {r: int(s * 100) for r, s in role_weights.items()}
            )
            st.subheader("Role-Level Word Cloud")
            st.image(wc.to_array(), use_container_width=True)

        elif viz == "Top-10 Bar Chart":
            # Let the user pick the ranking metric
            metric = st.sidebar.radio("Rank by:", ["Count", "Avg Relevancy"])
            field = "count" if metric == "Count" else "avg_rel"

            # Aggregate on role_name
            df_role = (
                results.groupby("role_name")
                .agg(count=("role_name", "size"), avg_rel=("relevancy_score", "mean"))
                .reset_index()
                .sort_values(field, ascending=False)
                .head(10)
            )

            chart = (
                alt.Chart(df_role)
                .mark_bar()
                .encode(
                    x=alt.X(f"{field}:Q", title=metric),
                    y=alt.Y("role_name:N", sort="-x", title="Role"),
                    tooltip=["role_name", "count", "avg_rel"],
                )
                .properties(title=f"Top-10 Roles by {metric}", height=400)
            )
            st.altair_chart(chart, use_container_width=True)

            with st.expander("View Data Table"):
                st.table(
                    df_role.rename(
                        columns={
                            "role_name": "Role",
                            "count": "Count",
                            "avg_rel": "Avg. Relevancy",
                        }
                    )
                )

        elif viz == "Treemap":
            # 1) Prepare a two-level DataFrame: role -> title
            df_tree = (
                results.groupby(["role_name", "title"])
                .agg(count=("title", "size"), avg_rel=("relevancy_score", "mean"))
                .reset_index()
            )

            # 2) Prune children: keep the top 5 titles per role and collapse
            #    the rest into an "Other Titles" bucket
            def prune_children(df, top_n=5):
                pieces = []
                for role, grp in df.groupby("role_name"):
                    # pick the top N titles by count
                    top = grp.nlargest(top_n, "count")
                    rest = grp.drop(top.index)
                    pieces.append(top)
                    if not rest.empty:
                        pieces.append(
                            pd.DataFrame(
                                {
                                    "role_name": [role],
                                    "title": ["Other Titles"],
                                    "count": [rest["count"].sum()],
                                    "avg_rel": [rest["avg_rel"].mean()],
                                }
                            )
                        )
                return pd.concat(pieces, ignore_index=True)

            # apply pruning
            df_tree = prune_children(df_tree, top_n=5)

            # 3) Build a treemap showing both levels at once
            fig = px.treemap(
                df_tree,
                path=["role_name", "title"],  # level 0 = role_name, level 1 = title
                values="count",
                color="avg_rel",
                color_continuous_scale="Viridis",
                hover_data=["count", "avg_rel"],
                title="Jobs Treemap (Roles → Titles)",
                maxdepth=2,  # always draw both levels
            )

            # 4) Improve padding & fonts for clarity
            fig.update_traces(
                tiling=dict(pad=3),  # inner padding
                outsidetextfont=dict(size=18, color="white"),  # role labels
                insidetextfont=dict(size=12, color="white"),  # title labels
                textinfo="label+value",  # show name + count on each rectangle
            )

            # 5) Add breathing room and a clear colorbar title
            fig.update_layout(
                margin=dict(t=50, l=25, r=25, b=25),
                coloraxis_colorbar=dict(title="Avg. Relevancy"),
            )
            st.plotly_chart(fig, use_container_width=True)
    # -----------------End of "Visualization" section-----------------
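
    # Pagination note: total_pages below uses ceiling division, e.g. 47 jobs
    # at 10 per page gives (47 + 10 - 1) // 10 = 5 pages, with 7 postings on
    # the final page.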
    st.subheader(f"Job Postings for: {current_major_display}")
    st.write("Results are ranked by semantic relevancy.")

    # Pagination setup
    JOBS_PER_PAGE = 10
    if "current_page" not in st.session_state:
        st.session_state.current_page = 0

    total_jobs = len(results)
    total_pages = (total_jobs + JOBS_PER_PAGE - 1) // JOBS_PER_PAGE
    # Clamp the page index in case the TOP_N slider shrank the result set
    # since the page number was last stored
    st.session_state.current_page = min(st.session_state.current_page, total_pages - 1)

    start_index = st.session_state.current_page * JOBS_PER_PAGE
    end_index = min(start_index + JOBS_PER_PAGE, total_jobs)
    results_page = results.iloc[start_index:end_index]

    # Display navigation buttons
    nav_cols = st.columns(
        [1, 1, 1], vertical_alignment="center", gap="large", border=True
    )
    with nav_cols[0]:
        if st.session_state.current_page > 0:
            if st.button("Previous"):
                st.session_state.current_page -= 1
                st.rerun()
    with nav_cols[0]:
        if st.session_state.current_page > 0:
            if st.button("First Page"):
                st.session_state.current_page = 0
                st.rerun()
    with nav_cols[1]:
        # Page number selector
        page_options = [i + 1 for i in range(total_pages)]
        selected_page_display = st.selectbox(
            "Go to Page:",
            options=page_options,
            index=st.session_state.current_page,
            key="page_selector",
        )
        # Update current_page if the selection changed
        if selected_page_display - 1 != st.session_state.current_page:
            st.session_state.current_page = selected_page_display - 1
            st.rerun()
    with nav_cols[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Next"):
                st.session_state.current_page += 1
                st.rerun()
    with nav_cols[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Last Page"):
                st.session_state.current_page = total_pages - 1
                st.rerun()

    st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")

    if results_page.empty:
        st.info("No job postings found for this page.")
    else:
        for _, row in results_page.iterrows():
            st.subheader(f"{row['title']} at {row['company_name']}")
            st.write(
                f"**Location:** {row['location']} | "
                f"**Experience Level:** {row['formatted_experience_level']} | "
                f"**Relevancy Score:** {row['relevancy_score']:.2f}"
            )
            with st.expander("View Details"):
                st.write("**Description:**")
                st.markdown(row["description"])
                if pd.notna(row["skills_desc"]) and row["skills_desc"]:
                    st.write("**Skills:**")
                    st.markdown(row["skills_desc"])
                st.write(f"**Listed Time:** {row['listed_time']}")
                st.write(f"**Work Type:** {row['formatted_work_type']}")
                st.write(f"**Remote Allowed:** {'Yes' if row['remote_allowed'] else 'No'}")
                salary_info = []
                if pd.notna(row["min_salary"]) and pd.notna(row["max_salary"]):
                    salary_info.append(
                        f"{row['currency']} {row['min_salary']:.2f} - "
                        f"{row['max_salary']:.2f} {row['pay_period']}"
                    )
                elif pd.notna(row["normalized_salary"]):
                    salary_info.append(
                        f"Normalized Salary: {row['currency']} "
                        f"{row['normalized_salary']:.2f}"
                    )
                if salary_info:
                    st.write(f"**Salary:** {', '.join(salary_info)}")
                else:
                    st.write("**Salary:** Not specified")
                if pd.notna(row["job_posting_url"]) and row["job_posting_url"]:
                    st.markdown(f"**Job Posting URL:** [Link]({row['job_posting_url']})")
                if pd.notna(row["application_url"]) and row["application_url"]:
                    st.markdown(f"**Application URL:** [Link]({row['application_url']})")
                st.write(f"**Views:** {row['views']} | **Applies:** {row['applies']}")
            st.markdown("---")
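
    # Streamlit note: st.rerun() ends the current script run and re-executes
    # the file from the top, so the page slice above is rebuilt with the new
    # st.session_state.current_page. The block below repeats the navigation
    # controls beneath the listings; its widget keys carry a "_bottom" suffix
    # because Streamlit requires unique keys per widget.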
    nav_cols_bottom = st.columns(
        [1, 1, 1], vertical_alignment="center", gap="large", border=True
    )
    with nav_cols_bottom[0]:
        if st.session_state.current_page > 0:
            if st.button("Previous", key="prev_bottom"):
                st.session_state.current_page -= 1
                st.rerun()
    with nav_cols_bottom[0]:
        if st.session_state.current_page > 0:
            if st.button("First Page", key="first_bottom"):
                st.session_state.current_page = 0
                st.rerun()
    with nav_cols_bottom[1]:
        page_options = [i + 1 for i in range(total_pages)]
        selected_page_display_bottom = st.selectbox(
            "Go to Page:",
            options=page_options,
            index=st.session_state.current_page,
            key="page_selector_bottom",
        )
        if selected_page_display_bottom - 1 != st.session_state.current_page:
            st.session_state.current_page = selected_page_display_bottom - 1
            st.rerun()
    with nav_cols_bottom[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Next", key="next_bottom"):
                st.session_state.current_page += 1
                st.rerun()
    with nav_cols_bottom[2]:
        if st.session_state.current_page < total_pages - 1:
            if st.button("Last Page", key="last_bottom"):
                st.session_state.current_page = total_pages - 1
                st.rerun()

    st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")

    if not results.empty:
        st.download_button(
            "Download Results as CSV",
            data=results.to_csv(index=False),
            file_name="job_results.csv",
            mime="text/csv",
        )
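
# Usage note (the filename app.py is an assumption; any name works):
#   JOB_URL=... FAISS_URL=... streamlit run app.py
# JOB_URL and FAISS_URL must point at Google Drive files downloadable by
# gdown for jobs.db and the FAISS index, respectively.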