Spaces:

sahsan
/

majors_explorer_test

Sleeping

File size: 23,639 Bytes

import streamlit as st
import sqlite3
import pandas as pd
import gdown
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
from wordcloud import WordCloud
import altair as alt
import textwrap
import plotly.express as px
from sklearn.cluster import KMeans

# Path Constants
MAJOR_DB_PATH = "./docs/majors.db"
MAP_DB_PATH = "./docs/map.db"
JOBS_DB_PATH = "./docs/jobs.db"
FAISS_INDEX_PATH = "./docs/jobs_embeddings.faiss"

# Secrets
JOBS_GDRIVE_URL = os.environ.get("JOB_URL")
FAISS_GDRIVE_URL = os.environ.get("FAISS_URL")

# Model Constants
EMBEDDINGS_MODEL = "mixedbread-ai/mxbai-embed-xsmall-v1"
RERANK_MODEL = "mixedbread-ai/mxbai-rerank-xsmall-v1"
dimensions=384

# Search Constants
MAX_JOB_POSTINGS_FETCH = 100
SEMANTIC_SCORE_SCALE = 100.0
RELEVANCY_THRESHOLD = 0.1

# Ensure job DB exists locally
def download_jobs_db():
    if not os.path.exists(JOBS_DB_PATH):
        st.info("Downloading job postings database...")
        gdown.download(JOBS_GDRIVE_URL, JOBS_DB_PATH, quiet=False)

# Ensure FAISS index exists locally
def download_faiss_index():
    if not os.path.exists(FAISS_INDEX_PATH):
        st.info("Downloading FAISS index...")
        gdown.download(FAISS_GDRIVE_URL, FAISS_INDEX_PATH, quiet=False)

# Load hierarchical structure from majors.db
@st.cache_data
def load_major_hierarchy():
    conn = sqlite3.connect(MAJOR_DB_PATH)
    df = pd.read_sql(
        "SELECT DISTINCT School, Department, [Major Name] AS Major, [Degree Level] AS DegreeLevel FROM majors;",
        conn,
    )
    conn.close()
    return df

# Load embedding model
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer(EMBEDDINGS_MODEL, truncate_dim=dimensions)

# Load FAISS index and job ID map
@st.cache_resource
def load_faiss_index():
    download_faiss_index()
    index = faiss.read_index(FAISS_INDEX_PATH)
    return index

# Generate embedding for a major
@st.cache_data
def get_major_embedding(major_display: str):
    """
    major_display is of the form "Major Name (DegreeLevel)".
    We parse out both pieces, lookup description, and encode all three.
    """
    model = load_embedding_model()

    # 1) parse the display into name & degree
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]  # drop trailing ")"
    else:
        name, degree = major_display, ""

    # 2) fetch the rich description from majors.db
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree)
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""

    # 3) build the full prompt
    full_text = f"{name} ({degree}). {desc}"

    # 4) embed
    emb = model.encode(full_text, prompt_name="query", convert_to_numpy=True)
    emb = np.array(emb, dtype='float32')
    faiss.normalize_L2(emb.reshape(1, -1))
    return emb

@st.cache_data
def get_major_query_text(major_display: str) -> str:
    # parse out name & degree exactly like get_major_embedding
    if "(" in major_display and major_display.endswith(")"):
        name, degree = major_display.rsplit("(", 1)
        name = name.strip()
        degree = degree[:-1]
    else:
        name, degree = major_display, ""
    # fetch the same description
    conn = sqlite3.connect(MAJOR_DB_PATH)
    row = conn.execute(
        "SELECT description FROM majors WHERE [Major Name]=? AND [Degree Level]=?",
        (name, degree)
    ).fetchone()
    conn.close()
    desc = row[0] if row and row[0] else ""
    # rebuild the exact query text
    return f"{name} ({degree}). {desc}"

# Perform semantic search using FAISS
@st.cache_data
def perform_semantic_search(major_embedding, _faiss_index, k_results):
    D, I = _faiss_index.search(major_embedding.reshape(1, -1), k_results)
    results = []
    for idx64, score in zip(I[0], D[0]):
        if idx64 == -1:
            continue
        job_id = int(idx64)                # this *is* your job_id
        results.append({'job_id': job_id, 'semantic_score': float(score)})
    return pd.DataFrame(results)

# Fetch jobs by ID and calculate relevancy in Python
def get_jobs_with_semantic_scores(job_ids_with_scores):
    if job_ids_with_scores.empty:
        return pd.DataFrame()

    job_ids = job_ids_with_scores['job_id'].tolist()
    placeholders = ','.join(['?' for _ in job_ids])
    
    conn = sqlite3.connect(JOBS_DB_PATH)
    sql = f"SELECT * FROM job_postings WHERE job_id IN ({placeholders});"
    jobs_df = pd.read_sql(sql, conn, params=job_ids)
    conn.close()

    # Merge with semantic scores
    merged_df = pd.merge(jobs_df, job_ids_with_scores, left_on='job_id', right_on='job_id', how='inner')
    
    # Calculate relevancy score
    merged_df['relevancy_score'] = merged_df['semantic_score'] * SEMANTIC_SCORE_SCALE
    
    # Calculate the 25th percentile of relevancy scores
    # Only calculate if there are enough scores to make sense, otherwise use a default low threshold
    if not merged_df.empty and len(merged_df) >= 4: # Ensure at least 4 elements for 25th percentile
        percentile_threshold = np.percentile(merged_df['relevancy_score'], 25)
    else:
        percentile_threshold = 0.0 # Fallback to a very low threshold if not enough data

    # Filter by the dynamic percentile threshold
    filtered_df = merged_df[merged_df['relevancy_score'] >= percentile_threshold]
    
    # Sort by relevancy score and limit to MAX_JOB_POSTINGS_FETCH
    sorted_df = filtered_df.sort_values(by='relevancy_score', ascending=False)
    
    return sorted_df.head(MAX_JOB_POSTINGS_FETCH)


# Run query on jobs.db
def query_jobs(sql_query, params):
    conn = sqlite3.connect(JOBS_DB_PATH)
    df = pd.read_sql(sql_query, conn, params=params)
    conn.close()
    return df

# Streamlit UI
st.set_page_config(
    page_title="Major-to-Job Explorer", 
    layout="centered",
    initial_sidebar_state="expanded"
)
st.title("🎓 Major-to-Job Postings Explorer")

# Download job DB and FAISS index if needed
download_jobs_db()
download_faiss_index()

# Load hierarchy
hierarchy_df = load_major_hierarchy()

# Step 1: Select School
schools = sorted(hierarchy_df["School"].unique())
selected_school = st.selectbox("Select a School:", schools)

if selected_school:
    departments = sorted(hierarchy_df[hierarchy_df["School"] == selected_school]["Department"].unique())
    selected_department = st.selectbox("Select a Department:", departments)

    if selected_department:
        majors_df = hierarchy_df[
            (hierarchy_df["School"] == selected_school) &
            (hierarchy_df["Department"] == selected_department)
        ].copy()

        # Create a display string for the selectbox
        majors_df["Display"] = majors_df["Major"] + " (" + majors_df["DegreeLevel"] + ")"
        
        # Create a mapping from display string to actual major name
        major_display_to_name = dict(zip(majors_df["Display"], majors_df["Major"]))

        # Sort the display names
        display_majors = sorted(majors_df["Display"].unique())
        
        selected_major_display = st.selectbox("Select a Major:", display_majors)

        # Get the actual major name from the display name
        selected_major = major_display_to_name.get(selected_major_display)

        search_button = st.button("Search Jobs")

        if search_button and selected_major:
            # Reset pagination when a new search is initiated
            st.session_state.current_page = 0
            st.session_state.last_selected_major = selected_major

            with st.spinner("Loading semantic data..."):
                faiss_index = load_faiss_index()
            
            with st.spinner(f"Generating embedding for {selected_major}..."):
                # use the “Major Name (DegreeLevel)” that the user selected
                major_embedding = get_major_embedding(selected_major_display)

            with st.spinner("Performing semantic search..."):
                # Fetch more results initially to allow for percentile filtering
                semantic_results = perform_semantic_search(major_embedding, faiss_index, MAX_JOB_POSTINGS_FETCH * 4)
                st.session_state.search_results = get_jobs_with_semantic_scores(semantic_results)

            if not st.session_state.search_results.empty:
                st.success(f"Search complete! Found {len(st.session_state.search_results)} relevant job postings.")
            else:
                st.warning("No relevant job postings found for this major.")
                st.session_state.search_results = pd.DataFrame()

        # Display results if they exist in session state
        if 'search_results' in st.session_state and not st.session_state.search_results.empty:
            results = st.session_state.search_results
            current_major_display = st.session_state.get('last_selected_major', 'Selected Major')
            
            # ── Cross‐Encoder Rerank using major description ──
            cross_encoder  = CrossEncoder(RERANK_MODEL)

            # 1) Grab the exact same query text we used for FAISS
            query_text = get_major_query_text(selected_major_display)

            # 2) Build (query_text, job_desc) pairs
            pairs = [(query_text, jd) for jd in results["description"].tolist()]

            # 3) Cross-encode as before
            cross_scores = cross_encoder.predict(pairs)
            results["cross_score"] = cross_scores
            results = results.sort_values("cross_score", ascending=False).reset_index(drop=True)

            # 4) (Optional) Truncate to top‐N for display
            TOP_N = st.sidebar.slider("Results to show", 5, 100, 50)
            results = results.head(TOP_N).copy()
            
            # ── Dynamic Role Clustering ──
            # 1) Re-encode titles into embeddings
            model  = load_embedding_model()
            titles = results["title"].tolist()
            embs   = model.encode(titles, convert_to_numpy=True)   # ← this bit was missing

            # 2) Cluster into up to 8 roles
            n_roles = min(8, len(titles))
            kmeans  = KMeans(n_clusters=n_roles, random_state=0).fit(embs)
            results["cluster_id"] = kmeans.labels_

            # 3) Build human-readable names
            centroids = kmeans.cluster_centers_
            role_names = []
            for cid, center in enumerate(centroids):
                idxs = np.where(results["cluster_id"] == cid)[0]      # positional indices
                cluster_embs = embs[idxs]
                # get the positional index of the closest embedding
                winner_pos = idxs[np.argmin(np.linalg.norm(cluster_embs - center, axis=1))]
                # use iloc to fetch by positional index
                role_names.append(results.iloc[winner_pos]["title"])

            # 4) Map into new column
            cluster_to_role = {i: name for i, name in enumerate(role_names)}
            results["role_name"] = results["cluster_id"].map(cluster_to_role)
            
            # ----------Beginning of "Visualization" section-----------------
            viz = st.sidebar.selectbox(
                "Choose a visualization",
                ["None", "Word Cloud", "Top-10 Bar Chart", "Treemap"],
                index = 2
            )
            if viz == "None":
                st.info("No visualization selected. Use the sidebar to choose one.")
            else:
                st.header("🔍 At-a-Glance: Top Job Roles")
                
                if viz == "Word Cloud":
                    # Sum relevancy by role
                    role_weights = (
                        results
                        .groupby("role_name")["relevancy_score"]
                        .sum()
                        .to_dict()
                    )

                    # Generate cloud
                    wc = WordCloud(
                        width=800, height=400,
                        background_color="white",
                        max_words=50
                    ).generate_from_frequencies({r: int(s*100) for r, s in role_weights.items()})

                    st.subheader("Role-Level Word Cloud")
                    st.image(wc.to_array(), use_container_width=True)

                elif viz == "Top-10 Bar Chart":
                    # Let user pick metric
                    metric = st.sidebar.radio("Rank by:", ["Count", "Avg Relevancy"])
                    field  = "count" if metric=="Count" else "avg_rel"

                    # Aggregate on role_name
                    df_role = (
                        results
                        .groupby("role_name")
                        .agg(count=("role_name","size"), avg_rel=("relevancy_score","mean"))
                        .reset_index()
                        .sort_values(field, ascending=False)
                        .head(10)
                    )

                    chart = (
                        alt.Chart(df_role)
                        .mark_bar()
                        .encode(
                            x=alt.X(f"{field}:Q", title=metric),
                            y=alt.Y("role_name:N", sort='-x', title="Role"),
                            tooltip=["role_name","count","avg_rel"]
                        )
                        .properties(title=f"Top-10 Roles by {metric}", height=400)
                    )

                    st.altair_chart(chart, use_container_width=True)

                    with st.expander("View Data Table"):
                        st.table(
                            df_role.rename(columns={
                            "role_name":"Role","count":"Count","avg_rel":"Avg. Relevancy"
                            })
                        )

                elif viz == "Treemap":
                    # 1) Prepare a two-level DataFrame
                    df_tree = (
                        results
                        .groupby(["role_name", "title"])
                        .agg(
                            count=("title", "size"),
                            avg_rel=("relevancy_score", "mean")
                        )
                        .reset_index()
                    )
                    
                    # 2) Prune children: keep top 5 titles per role, aggregate the rest
                    def prune_children(df, top_n=5):
                        pieces = []
                        for role, grp in df.groupby("role_name"):
                            # pick the top N by count
                            top = grp.nlargest(top_n, "count")
                            rest = grp.drop(top.index)
                            pieces.append(top)
                            if not rest.empty:
                                pieces.append(pd.DataFrame({
                                    "role_name": [role],
                                    "title":       ["Other Titles"],
                                    "count":       [rest["count"].sum()],
                                    "avg_rel":     [rest["avg_rel"].mean()]
                                }))
                        return pd.concat(pieces, ignore_index=True)
                    
                    # apply pruning
                    df_tree = prune_children(df_tree, top_n=5)

                    # 3) Build a treemap showing both levels at once
                    fig = px.treemap(
                        df_tree,
                        path=["role_name", "title"],     # level-0=role_name, level-1=title
                        values="count",
                        color="avg_rel",
                        color_continuous_scale="Viridis",
                        hover_data=["count", "avg_rel"],
                        title="Jobs Treemap (Roles → Titles)",
                        maxdepth=2                       # always draw both levels
                    )

                    # 4) Improve padding & fonts for clarity
                    fig.update_traces(
                        tiling=dict(pad=3),             # inner padding
                        outsidetextfont=dict(size=18, color="white"),  # role labels
                        insidetextfont=dict(size=12, color="white"),   # title labels
                        textinfo="label+value"          # show name + count on each rectangle
                    )

                    # 5) Add breathing room and a clear colorbar title
                    fig.update_layout(
                        margin=dict(t=50, l=25, r=25, b=25),
                        coloraxis_colorbar=dict(title="Avg. Relevancy")
                    )

                    st.plotly_chart(fig, use_container_width=True)
                    
                # -----------------End of "Visualization" section-----------------



            st.subheader(f"Job Postings for: {current_major_display}")
            st.write("Results are ranked by semantic relevancy.")

            # Pagination setup
            JOBS_PER_PAGE = 10
            if 'current_page' not in st.session_state:
                st.session_state.current_page = 0

            total_jobs = len(results)
            total_pages = (total_jobs + JOBS_PER_PAGE - 1) // JOBS_PER_PAGE

            start_index = st.session_state.current_page * JOBS_PER_PAGE
            end_index = min(start_index + JOBS_PER_PAGE, total_jobs)

            results_page = results.iloc[start_index:end_index]

            # Display navigation buttons
            nav_cols = st.columns([1, 1, 1], vertical_alignment='center', gap='large', border=True)
            with nav_cols[0]:
                if st.session_state.current_page > 0:
                    if st.button("Previous"):
                        st.session_state.current_page -= 1
                        st.rerun()
            with nav_cols[0]:
                if st.session_state.current_page > 0:
                    if st.button("First Page"):
                        st.session_state.current_page = 0
                        st.rerun()
            with nav_cols[1]:
                # Page number selector
                page_options = [i + 1 for i in range(total_pages)]
                selected_page_display = st.selectbox(
                    "Go to Page:",
                    options=page_options,
                    index=st.session_state.current_page,
                    key="page_selector"
                )
                # Update current_page if selection changes
                if selected_page_display - 1 != st.session_state.current_page:
                    st.session_state.current_page = selected_page_display - 1
                    st.rerun()
            with nav_cols[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Next"):
                        st.session_state.current_page += 1
                        st.rerun()
            with nav_cols[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Last Page"):
                        st.session_state.current_page = total_pages - 1
                        st.rerun()

            st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")

            if results_page.empty:
                st.info("No job postings found for this page.")
            else:
                for index, row in results_page.iterrows():
                    st.subheader(f"{row['title']} at {row['company_name']}")
                    st.write(f"**Location:** {row['location']} | **Experience Level:** {row['formatted_experience_level']} | **Relevancy Score:** {row['relevancy_score']:.2f}")

                    with st.expander("View Details"):
                        st.write(f"**Description:**")
                        st.markdown(row['description'])

                        if pd.notna(row['skills_desc']) and row['skills_desc']:
                            st.write(f"**Skills:**")
                            st.markdown(row['skills_desc'])

                        st.write(f"**Listed Time:** {row['listed_time']}")
                        st.write(f"**Work Type:** {row['formatted_work_type']}")
                        st.write(f"**Remote Allowed:** {'Yes' if row['remote_allowed'] else 'No'}")

                        salary_info = []
                        if pd.notna(row['min_salary']) and pd.notna(row['max_salary']):
                            salary_info.append(f"{row['currency']} {row['min_salary']:.2f} - {row['max_salary']:.2f} {row['pay_period']}")
                        elif pd.notna(row['normalized_salary']):
                            salary_info.append(f"Normalized Salary: {row['currency']} {row['normalized_salary']:.2f}")
                        if salary_info:
                            st.write(f"**Salary:** {', '.join(salary_info)}")
                        else:
                            st.write("**Salary:** Not specified")

                        if pd.notna(row['job_posting_url']) and row['job_posting_url']:
                            st.markdown(f"**Job Posting URL:** [Link]({row['job_posting_url']})")
                        if pd.notna(row['application_url']) and row['application_url']:
                            st.markdown(f"**Application URL:** [Link]({row['application_url']})")

                        st.write(f"**Views:** {row['views']} | **Applies:** {row['applies']}")
                    st.markdown("---")
            nav_cols_top = st.columns([1, 1, 1], vertical_alignment='center', gap='large', border=True)
            with nav_cols_top[0]:
                if st.session_state.current_page > 0:
                    if st.button("Previous", key="prev_top"):
                        st.session_state.current_page -= 1
                        st.rerun()
            with nav_cols_top[0]:
                if st.session_state.current_page > 0:
                    if st.button("First Page", key="first_top"):
                        st.session_state.current_page = 0
                        st.rerun()
            with nav_cols_top[1]:
                page_options = [i + 1 for i in range(total_pages)]
                selected_page_display_top = st.selectbox(
                    "Go to Page:",
                    options=page_options,
                    index=st.session_state.current_page,
                    key="page_selector_top"
                )
                if selected_page_display_top - 1 != st.session_state.current_page:
                    st.session_state.current_page = selected_page_display_top - 1
                    st.rerun()
            with nav_cols_top[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Next", key="next_top"):
                        st.session_state.current_page += 1
                        st.rerun()
            with nav_cols_top[2]:
                if st.session_state.current_page < total_pages - 1:
                    if st.button("Last Page", key="last_top"):
                        st.session_state.current_page = total_pages - 1
                        st.rerun()

            st.write(f"Displaying jobs {start_index + 1}-{end_index} of {total_jobs}")

            if not results.empty:
                st.download_button(
                    "Download Results as CSV",
                    data=results.to_csv(index=False),
                    file_name="job_results.csv",
                    mime="text/csv"
                )