Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

LOC-Metadate-Analyzer / app.py

CCockrum

Update app.py

8f294a5 verified 8 months ago

raw

history blame contribute delete

20.4 kB

	import html
	import os
	import time

	import matplotlib
	import pandas as pd
	import plotly.express as px
	import requests
	import streamlit as st
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	def is_missing(value):
	    """Return True when a metadata cell carries no usable content.

	    A cell counts as missing when it is None/NaN/NA, when its string form is
	    empty or whitespace-only, or when it is an empty container.

	    Args:
	        value: A single cell value (scalar, string, or list-like).

	    Returns:
	        bool: True if the value is missing, False otherwise.
	    """
	    # pd.isna() on a list-like returns an array whose truth value is ambiguous,
	    # so containers are judged by their length instead.
	    if isinstance(value, (list, tuple, set, dict)):
	        return len(value) == 0
	    return bool(pd.isna(value)) or str(value).strip() == ""

	# Hugging Face Inference API token, read once from the HF_API environment
	# variable; the value is None when that variable is not set.
	api_key = os.environ.get("HF_API")

	def get_huggingface_suggestions(title, description):
	    """Suggest subject labels for a record via zero-shot classification.

	    The non-blank title and description are joined into one text and posted to
	    the hosted facebook/bart-large-mnli model together with a fixed list of
	    candidate labels. Labels scoring above 0.3 are kept.

	    Args:
	        title: Record title; None or blank text is ignored.
	        description: Record description; None or blank text is ignored.

	    Returns:
	        A comma-separated string of suggested labels, or None when there is no
	        text to classify, the request fails, the API reports an error, or no
	        label passes the score threshold.
	    """
	    # Build the text from the non-blank parts only. Formatting "{title}. {description}"
	    # directly always leaves at least "." behind, so an emptiness check on it never fires.
	    parts = [str(part).strip() for part in (title, description) if part is not None]
	    full_text = ". ".join(part for part in parts if part)

	    if not full_text:
	        return None

	    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
	    # Only send credentials when a key is configured, instead of "Bearer None".
	    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

	    candidate_labels = [
	        "History", "Politics", "Science", "Technology", "Art", "Literature",
	        "Education", "Economics", "Military", "Geography", "Sociology",
	        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
	        "Mathematics", "Computer Science", "Agriculture", "Environment",
	        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
	    ]

	    payload = {
	        "inputs": full_text,
	        "parameters": {
	            "candidate_labels": candidate_labels,
	            "multi_label": True
	        }
	    }

	    try:
	        # The timeout keeps a stalled endpoint from blocking the whole script run.
	        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
	        result = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        st.error(f"API Error: {e}")
	        return None

	    # Anything other than a JSON object cannot be read with the lookups below.
	    if not isinstance(result, dict):
	        st.error("API Error: unexpected response format")
	        return None

	    if "error" in result:
	        st.error(f"API error: {result['error']}")
	        return None

	    labels = [
	        label
	        for label, score in zip(result.get("labels", []), result.get("scores", []))
	        if score > 0.3
	    ]

	    return ", ".join(labels) if labels else None

	# Custom CSS
	# Injects a single <style> element that themes the page: dark page/header
	# background, grey content container, sidebar colours, alert boxes and the
	# .custom-table / .sidebar-* helper classes. unsafe_allow_html=True is passed so
	# the raw <style> markup is emitted.
	# NOTE(review): the CSS text has a stray "}" on its own line right after the
	# .main rule and a stray ";" right before the "html, body, ..." rule. Browsers
	# presumably discard the rule that follows each stray token - confirm before
	# relying on the .block-container or html/body declarations.
	# NOTE(review): section[data-testid="stSidebar"] > div:first-child is declared
	# twice with different colours; presumably the later declaration wins - confirm.
	st.markdown("""
	<style>

	.main {
	background-color: #1A1A1A !important; /* dark */
	color: #D3D3D3 !important;
	}

	}
	.block-container {
	background-color: #D3D3D3 !important;
	color: #cccccc !important;
	padding-left: 3rem !important;
	padding-right: 3rem !important;
	max-width: 900px; /* widen main feed */
	margin: auto; /* center it */
	}
	/* Headings */
	h1, h2, h3, h4 {
	color: #eeeeee !important; /* brighter light gray for headings */
	font-weight: 700 !important; /* bold */
	margin-bottom: 1rem !important;
	}
	p, span, div {
	color: #cccccc !important;
	}
	/* Subheaders (optional) */
	.stSubheader {
	color: #dddddd !important;
	font-size: 1.4rem !important;
	}
	/* Dataframes (optional tweak) */
	.stDataFrame {
	background-color: #2e2e2e !important;
	border-radius: 10px;
	padding: 1rem;
	}
	section[data-testid="stSidebar"] > div:first-child {
	background-color: #808080 !important;
	padding: 1rem;
	border-radius: 0.5rem;
	color: #808080 !important;
	}
	.stMarkdown, .stTextInput, .stDataFrame {
	color: #1A1A1A!important;
	}
	img.banner {
	width: 100%;
	border-radius: 12px;
	margin-bottom: 1rem;
	}
	.stAlert {
	background-color: #f0f0f5 !important;
	color: #1A1A1A !important;
	padding: 1.25rem !important;
	font-size: 1rem !important;
	border-radius: 0.5rem !important;
	box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
	}
	header[data-testid="stHeader"] {
	background-color: #1A1A1A !important;
	}

	section[data-testid="stSidebar"] > div:first-child {
	background-color: #1A1A1A !important;
	color: #FFFFFF !important;
	padding: 2rem 1.5rem 1.5rem 1.5rem !important;
	border-radius: 12px;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
	font-size: 0.95rem;
	line-height: 1.5;
	}
	;
	html, body, [data-testid="stApp"] {
	background-color: #1A1A1A !important;
	}
	.custom-table {
	background-color: #D3D3D3;
	color: #1A1A1A;
	font-family: monospace;
	padding: 1rem;
	border-radius: 8px;
	overflow-x: auto;
	white-space: pre;
	border: 1px solid #ccc;

	}
	.sidebar-stats {
	color: lightgray !important;
	font-size: 1.1rem !important;
	margin-top: 1.5rem;
	font-weight: 600;
	}
	.sidebar-contrast-block {
	background-color: #2b2b2b !important;
	padding: 1.25rem;
	border-radius: 10px;
	margin-top: 1.5rem;
	}
	section.main > div { /* widen main container */
	max-width: 95%;
	padding-left: 3rem;
	padding-right: 3rem;

	}

	</style>
	""", unsafe_allow_html=True)

	# NOTE: a placeholder re-definition of get_huggingface_suggestions() used to sit
	# here. Its body only assigned API_URL and then fell through, returning None,
	# and because it came later in the module it replaced the complete
	# implementation defined near the top of the file - so no AI subject suggestion
	# was ever produced. The placeholder has been removed; the implementation near
	# the top of the file is the only definition.

	# Page banner: a remote image stretched to the container width.
	st.image(
	    "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg",
	    use_container_width=True,
	)

	# Title and one-paragraph description of what the tool does.
	st.title("MetaDiscovery Agent for Library of Congress Collections")
	st.markdown("""
	This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
	an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
	""")

	# Display name -> search query, already "+"-joined for use in the LOC search URL.
	collections = {
	    "American Revolutionary War Maps": "american+revolutionary+war+maps",
	    "Civil War Maps": "civil+war+maps",
	    "Women's Suffrage": "women+suffrage",
	    "World War I Posters": "world+war+posters",
	}

	# Start from an empty frame so later emptiness checks always have a value to test.
	metadata_df = pd.DataFrame()

	# Sidebar: an opening <div>, the collection radio group (fixed widget key), and a
	# closing </div>, each emitted as its own sidebar element.
	# NOTE(review): the inline style has no ";" after "background-color: #2b2b2b",
	# so that declaration and the padding after it are presumably ignored - confirm.
	st.sidebar.markdown("""
	<div style='
	background-color: #2b2b2b
	padding: 1.5rem;
	border-radius: 12px;
	margin-bottom: 1.5rem;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	'>
	""", unsafe_allow_html=True)
	selected = st.sidebar.radio(
	    "Select a Collection",
	    list(collections),
	    key="collection_selector",
	)
	st.sidebar.markdown("</div>", unsafe_allow_html=True)

	# Search endpoint for the chosen collection, asking for a JSON response.
	search_query = collections[selected]
	collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

	# Reserved sidebar slot for Quick Stats.
	stats_placeholder = st.sidebar.empty()

	# No fetch button is rendered: the flag is hard-coded, so data is fetched on
	# every script run.
	fetch_data = True

	def _record_to_item(record):
	    """Flatten one search-result dict into the fields this app analyses.

	    List-valued descriptions and subjects are joined into single strings, and a
	    blank title/date falls back to the value inside the nested "item" dict.

	    Args:
	        record: One result dict from the search response.

	    Returns:
	        dict with the keys id, title, date, subject, creator, description.
	    """
	    description = record.get("description", "")
	    if isinstance(description, list):
	        description = " ".join(str(part) for part in description)

	    subject = record.get("subject", "")
	    if isinstance(subject, list):
	        # str() keeps a non-string entry from making join() raise TypeError.
	        subject = ", ".join(str(term) for term in subject)

	    # The nested "item" is only usable as a fallback source when it is a dict.
	    nested = record.get("item")
	    if not isinstance(nested, dict):
	        nested = {}

	    return {
	        "id": record.get("id", ""),
	        "title": record.get("title", "") or nested.get("title", ""),
	        "date": record.get("date", "") or nested.get("date", ""),
	        "subject": subject,
	        "creator": record.get("creator", ""),
	        "description": description,
	    }


	if fetch_data:
	    # Every failure path below leaves this empty, so extraction always has a list.
	    records = []

	    # Display a loading spinner while fetching data
	    with st.spinner(f"Fetching data for {selected}..."):
	        # Fetch data from LOC API with spoofed User-Agent header
	        headers = {
	            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
	        }

	        try:
	            # Without a timeout a stalled connection would block the app indefinitely.
	            response = requests.get(collection_url, headers=headers, timeout=30)
	            response.raise_for_status()
	        except requests.exceptions.RequestException as e:
	            st.error(f"API Connection Error: {e}")
	        else:
	            # JSON decoding is handled on its own so a malformed body is reported
	            # as a parse failure rather than as a connection error.
	            try:
	                data = response.json()
	            except ValueError:
	                st.error("Failed to parse API response as JSON")
	            else:
	                if isinstance(data, dict) and "results" in data:
	                    records = data.get("results") or []
	                elif isinstance(data, dict) and "items" in data:
	                    records = data.get("items") or []
	                else:
	                    st.error("Unexpected API response structure. No records found.")
	                st.write(f"Retrieved {len(records)} records")

	    # Extract selected metadata fields (non-dict entries are skipped)
	    items = [_record_to_item(record) for record in records if isinstance(record, dict)]

	    metadata_df = pd.DataFrame(items)

	# Count blank cells per field, looking only at fields that exist as columns.
	fields_to_check = ["subject", "creator", "date", "title", "description"]
	missing_counts = {
	    field: metadata_df[field].apply(is_missing).sum()
	    for field in fields_to_check
	    if field in metadata_df.columns
	}

	# Define custom completeness check
	def is_incomplete(value):
	    """Return True when a cell is blank or holds a placeholder value.

	    None/NaN/NA, empty containers, and strings that are empty, whitespace-only,
	    "N/A" or "null" (ignoring surrounding whitespace) all count as incomplete.

	    Args:
	        value: A single cell value (scalar, string, or list-like).

	    Returns:
	        bool: True if the value is incomplete, False otherwise.
	    """
	    if isinstance(value, str):
	        return value.strip() in ("", "N/A", "null")
	    # pd.isna() on a list-like returns an array whose truth value is ambiguous,
	    # so containers are judged by their length instead.
	    if isinstance(value, (list, tuple, set, dict)):
	        return len(value) == 0
	    return bool(pd.isna(value))


	# Completeness analysis and the sidebar summary tables.
	# NOTE(review): indentation is not visible here; the statements below read
	# metadata_df and are presumed to be nested under this emptiness check - confirm.
	if not metadata_df.empty:
	# --- Unified Completeness and Missing Fields Analysis ---

	# Define incompleteness at the cell level.
	# NOTE: this assignment rebinds the name is_incomplete to a lambda.
	is_incomplete = lambda value: pd.isna(value) or value in ["", "N/A", "null", None]

	# Boolean frame with the same shape as metadata_df: True where a cell is incomplete.
	missing_mask = metadata_df.map(is_incomplete)

	# Overall figures: rows with at least one incomplete cell, and the share of
	# filled cells across the whole frame as a percentage.
	incomplete_count = missing_mask.any(axis=1).sum()
	total_fields = metadata_df.size
	filled_fields = (~missing_mask).sum().sum()
	overall_percent = (filled_fields / total_fields) * 100

	# Field-specific missing counts (for Missing Metadata Summary).
	# This replaces any earlier value of missing_counts with a Series sorted descending.
	# NOTE: the missing_df built here is assigned again further down, before it is displayed.
	missing_counts = missing_mask.sum().sort_values(ascending=False)
	missing_df = (
	pd.DataFrame(missing_counts)
	.reset_index()
	.rename(columns={"index": "Field", 0: "Missing Count"})
	)


	# Field-level completeness: percentage of complete cells per column
	# (the incompleteness mask is recomputed here rather than reusing missing_mask).
	completeness = (~metadata_df.map(is_incomplete)).mean() * 100
	completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
	completeness_table = completeness_df.set_index("Field")

	# Sidebar Quick Stats: two counts and one percentage share a single "Value" column.
	quick_stats = pd.DataFrame({
	"Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
	"Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
	})

	# The one-decimal format is applied to the whole "Value" column, counts included.
	styled_quick_stats = (
	quick_stats.style
	.hide(axis="index")
	.background_gradient(cmap="Oranges", subset=["Value"])
	.format({"Value": "{:.1f}"})
	)

	# Add an expander and put the dataframe inside it
	with st.sidebar.expander("Quick Stats", expanded=True):
	st.dataframe(
	styled_quick_stats,
	use_container_width=True,
	hide_index=True
	)

	# Sidebar: Metadata Missing Stats.
	# Rebuilds missing_df from missing_counts.items() as Field / Missing Count
	# rows, sorted with the largest count first.
	missing_df = (
	pd.DataFrame(list(missing_counts.items()), columns=["Field", "Missing Count"])
	.sort_values(by="Missing Count", ascending=False)
	.reset_index(drop=True)
	)

	styled_missing_df = (
	missing_df.style
	.background_gradient(cmap="Blues", subset=["Missing Count"])
	.hide(axis="index")
	)

	# Height grows with the number of rows (35 per row plus 38) and is capped at 300.
	with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
	st.dataframe(
	styled_missing_df,
	use_container_width=True,
	hide_index=True, # hide the index column in the rendered table
	height=min(300, len(missing_df) * 35 + 38)
	)

	# Calculate Top 10 Subjects: split each subject string on commas, strip the
	# pieces, and count how often each term occurs.
	if 'subject' in metadata_df.columns:
	top_subjects = (
	metadata_df['subject']
	.dropna()
	.str.split(',')
	.explode()
	.str.strip()
	.value_counts()
	.head(10)
	.to_frame(name="Count")
	)

	# Most Common Subjects in Sidebar.
	# NOTE(review): top_subjects only exists when the 'subject' column is present;
	# whether this expander is nested under that check is not visible here - confirm.
	with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
	st.dataframe(
	top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
	use_container_width=True,
	height=240
	)

	# Collapsed sidebar section holding a styled list of external LOC reference
	# links; each link opens in a new tab.
	helpful_resources = st.sidebar.expander("Helpful Resources", expanded=False)
	with helpful_resources:
	    st.markdown("""
	<style>
	.sidebar-links a {
	color: lightgray !important;
	text-decoration: none !important;
	}
	.sidebar-links a:hover {
	text-decoration: underline !important;
	}
	</style>
	<div class="sidebar-links">
	<ul style='padding-left: 1em'>
	<li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
	<li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
	<li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
	<li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
	<li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
	</ul>
	</div>
	""", unsafe_allow_html=True)

	# Utility functions for deeper metadata quality analysis
	def is_incomplete(value):
	    """Return True when a cell is blank or holds a placeholder value.

	    None/NaN/NA, empty containers, and strings that are empty, whitespace-only,
	    "N/A" or "null" (ignoring surrounding whitespace) all count as incomplete.

	    Args:
	        value: A single cell value (scalar, string, or list-like).

	    Returns:
	        bool: True if the value is incomplete, False otherwise.
	    """
	    if isinstance(value, str):
	        return value.strip() in ("", "N/A", "null")
	    # pd.isna() on a list-like returns an array whose truth value is ambiguous,
	    # so containers are judged by their length instead.
	    if isinstance(value, (list, tuple, set, dict)):
	        return len(value) == 0
	    return bool(pd.isna(value))

	def is_valid_date(value):
	    """Return True when pandas can parse ``value`` into an actual date.

	    Args:
	        value: Candidate date (typically a string such as "1861" or
	            "2020-01-15").

	    Returns:
	        bool: False when parsing raises or when the input is blank/missing;
	        True otherwise.
	    """
	    try:
	        parsed = pd.to_datetime(value)
	    except (ValueError, TypeError, OverflowError):
	        # Parse failures and out-of-range timestamps surface as these types;
	        # unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
	        return False
	    # Blank input does not raise: it comes back as None or NaT, which is not a date.
	    return parsed is not None and parsed is not pd.NaT

	def _notice_box(message):
	    """Wrap a short status message in the dark rounded box used by this section."""
	    return (
	        '<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; '
	        'margin-top: 1rem; color: #e0e0e0;">\n'
	        f"{message}\n"
	        "</div>"
	    )


	def _suggestions_table_html(suggestions):
	    """Render (title, suggested subject) pairs as a dark HTML table.

	    Titles longer than 50 characters are truncated with "...". Both columns are
	    HTML-escaped because the titles come from the remote API and the result is
	    rendered with unsafe_allow_html=True.
	    """
	    # For demonstration, using a fixed green shade
	    green_shade = "rgba(0, 100, 0, 0.3)"

	    rows = []
	    for title, subject in suggestions:
	        title_display = title[:50] + "..." if len(title) > 50 else title
	        rows.append(
	            '<tr style="border-bottom: 1px solid #444;">\n'
	            f'<td style="padding: 12px; text-align: left;">{html.escape(title_display)}</td>\n'
	            f'<td style="padding: 12px; text-align: left; background-color: {green_shade};">'
	            f"{html.escape(subject)}</td>\n"
	            "</tr>"
	        )

	    return (
	        '<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem;">\n'
	        '<table style="width: 100%; border-collapse: collapse; color: #e0e0e0;">\n'
	        "<thead>\n"
	        '<tr style="border-bottom: 1px solid #444;">\n'
	        '<th style="padding: 12px; text-align: left; color: #e0e0e0;">Title</th>\n'
	        '<th style="padding: 12px; text-align: left; color: #e0e0e0;">Suggested Subject</th>\n'
	        "</tr>\n"
	        "</thead>\n"
	        "<tbody>\n"
	        + "\n".join(rows)
	        + "\n</tbody>\n</table>\n</div>"
	    )


	if not metadata_df.empty:
	    st.subheader("Retrieved Metadata Sample")
	    st.dataframe(metadata_df.head())

	    st.subheader("Field Completeness Breakdown")

	    # Dark box for the Field Completeness Breakdown (matches the other sections)
	    st.markdown("""
	<div style='
	background-color: #2e2e2e;
	padding: 1.5rem;
	border-radius: 10px;
	margin-top: 1.5rem;
	color: lightgray;
	'>
	""", unsafe_allow_html=True)

	    # Per-field completeness percentages, shaded green and shown without decimals
	    st.dataframe(
	        completeness_table.style
	        .background_gradient(cmap="Greens")
	        .format("{:.0f}%")
	        .hide(axis="index"),
	        use_container_width=True,
	        height=240,
	    )

	    st.markdown("</div>", unsafe_allow_html=True)

	    # Identify incomplete records (rows with at least one incomplete cell)
	    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
	    incomplete_records = metadata_df[incomplete_mask]

	    # --- Suggested Metadata Enhancements Section ---
	    st.subheader("Suggested Metadata Enhancements")

	    # The checkbox label is hidden; the visible caption is the markdown line below it.
	    use_ai = st.checkbox("Use AI Suggestions", value=True, label_visibility="hidden")
	    st.markdown("🤖 Use AI Suggestions (Hugging Face)")

	    # Records that lack a subject but have a title or description to classify.
	    # Blank cells are stored as empty strings, not nulls, so is_missing() is used
	    # here; an isnull() test never matches those placeholders.
	    subject_blank = metadata_df['subject'].apply(is_missing)
	    text_blank = (
	        metadata_df['title'].apply(is_missing)
	        & metadata_df['description'].apply(is_missing)
	    )
	    incomplete_with_desc = metadata_df[subject_blank & ~text_blank]

	    if incomplete_with_desc.empty:
	        st.markdown(
	            _notice_box("All records already have subjects or no usable text available."),
	            unsafe_allow_html=True,
	        )
	    elif not use_ai:
	        st.markdown(
	            _notice_box("Enable AI Suggestions to view recommendations."),
	            unsafe_allow_html=True,
	        )
	    else:
	        suggestions = []
	        # At most ten records are classified per run to limit the number of API calls.
	        records_to_process = min(10, len(incomplete_with_desc))
	        progress = st.progress(0)
	        status = st.empty()

	        batch = incomplete_with_desc.head(records_to_process)
	        for position, (_, row) in enumerate(batch.iterrows(), start=1):
	            title = "" if is_missing(row['title']) else str(row['title'])
	            description = "" if is_missing(row['description']) else str(row['description'])
	            status.text(f"Analyzing {position}/{records_to_process}: {title[:30]}...")
	            suggested_subject = get_huggingface_suggestions(title, description)
	            if suggested_subject:
	                suggestions.append((title, suggested_subject))
	            progress.progress(position / records_to_process)

	        # Remove the transient progress widgets once the batch is done.
	        status.empty()
	        progress.empty()

	        if suggestions:
	            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
	            st.markdown(_suggestions_table_html(suggestions), unsafe_allow_html=True)
	        else:
	            st.markdown(
	                _notice_box("No metadata enhancement suggestions available."),
	                unsafe_allow_html=True,
	            )