import os
from typing import Dict, Any

import streamlit as st

from api.pdf_processor import PDFProcessor
from api.summarizer import BookSummarizer

DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "sshleifer/distilbart-cnn-12-6")
AVAILABLE_MODELS = BookSummarizer(DEFAULT_MODEL).get_available_models()

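# Page chrome; st.set_page_config must be the first Streamlit command in the script.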
st.set_page_config(
    page_title="Book Summarizer",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)


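# Heavy objects are cached with st.cache_resource so the PDF processor and the
# summarization pipeline are built once and reused across Streamlit reruns.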
@st.cache_resource
def get_pdf_processor() -> PDFProcessor:
    return PDFProcessor()


@st.cache_resource
def get_summarizer(model_name: str) -> BookSummarizer:
    summarizer = BookSummarizer(model_name=model_name)
    summarizer.load_model()
    return summarizer


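# End-to-end pipeline: validate the uploaded PDF, extract its metadata and text,
# run chunked summarization, and return everything the UI needs in one dictionary.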
def summarize_pdf(
    uploaded_file,
    model_name: str,
    max_length: int,
    min_length: int,
    chunk_size: int,
    overlap: int,
) -> Dict[str, Any]:
    pdf_bytes = uploaded_file.getvalue()
    processor = get_pdf_processor()

    validation = processor.validate_pdf(pdf_bytes)
    if not validation["valid"]:
        raise ValueError(validation["message"])

    metadata = processor.get_pdf_metadata(pdf_bytes)
    extraction = processor.extract_text_from_pdf(pdf_bytes)
    if not extraction["success"]:
        raise RuntimeError(extraction["message"])

    summarizer = get_summarizer(model_name)
    summary_result = summarizer.summarize_book(
        text=extraction["text"],
        chunk_size=chunk_size,
        overlap=overlap,
        max_length=max_length,
        min_length=min_length,
    )

    if not summary_result["success"]:
        raise RuntimeError(summary_result.get("error", "Summarization failed"))

    return {
        "metadata": metadata,
        "validation": validation,
        "extraction": extraction,
        "summary": summary_result,
    }


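# Sidebar controls for model choice and generation/chunking parameters; the
# selected values are returned as a plain dict consumed by main().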
def sidebar_controls():
    st.header("Settings")

    model_names = [m["name"] for m in AVAILABLE_MODELS]
    model_descriptions = {m["name"]: m["description"] for m in AVAILABLE_MODELS}

    selected_model = st.selectbox(
        "Model",
        model_names,
        index=model_names.index(DEFAULT_MODEL) if DEFAULT_MODEL in model_names else 0,
        help="Free, locally run Hugging Face models. First run downloads weights.",
    )
    st.caption(model_descriptions.get(selected_model, ""))

    max_length = st.slider(
        "Maximum summary length (words)",
        min_value=50,
        max_value=250,
        value=140,
        step=10,
    )
    min_length_limit = min(120, max_length - 10)
    min_length = st.slider(
        "Minimum summary length (words)",
        min_value=20,
        max_value=min_length_limit,
        value=min(50, max_length - 20),
        step=5,
    )

    chunk_size = st.slider(
        "Chunk size (characters)",
        min_value=600,
        max_value=2000,
        value=1200,
        step=50,
        help="Longer chunks preserve context but take longer.",
    )
    overlap = st.slider(
        "Chunk overlap (characters)",
        min_value=50,
        max_value=300,
        value=120,
        step=10,
    )

    return {
        "model": selected_model,
        "max_length": max_length,
        "min_length": min_length,
        "chunk_size": chunk_size,
        "overlap": overlap,
    }


def show_file_info(uploaded_file):
    size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
    st.info(f"Selected: **{uploaded_file.name}** ({size_mb:.1f} MB)")


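# Renders the results: headline metrics, the generated summary with a download
# button, and a short preview of the extracted text.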
def show_results(result: Dict[str, Any]):
    summary_text = result["summary"]["summary"]
    stats = result["summary"]["statistics"]
    original_stats = result["extraction"]["statistics"]

    st.success("Summary ready!")

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Pages", result["validation"]["pages"])
    col2.metric("Original words", f"{original_stats.get('total_words', 0):,}")
    col3.metric("Summary words", f"{stats.get('final_summary_length', 0):,}")
    compression = stats.get("overall_compression_ratio", 0)
    col4.metric("Compression", f"{compression:.1%}" if compression else "N/A")

    st.subheader("Summary")
    st.text_area("Generated summary", value=summary_text, height=400, label_visibility="collapsed")

    st.download_button(
        label="Download summary",
        data=summary_text.encode("utf-8"),
        file_name=f"{result['metadata'].get('title', 'summary').replace(' ', '_')}.txt",
        mime="text/plain",
    )

    st.subheader("Book snapshot")
    preview = result["extraction"]["text"][:1500]
    if len(result["extraction"]["text"]) > 1500:
        preview += " ..."
    st.text_area("First 1500 characters", value=preview, height=220, label_visibility="collapsed")


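# Page flow: sidebar settings, file uploader, then the summarize action with a
# spinner and error feedback.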
def main():
    st.title("📚 AI-Powered Book Summarizer")
    st.write(
        "Upload a PDF (under 50MB) to generate a concise summary locally with free, open models. "
        "No paid API keys required; first run will download model weights."
    )

    st.divider()

    with st.sidebar:
        controls = sidebar_controls()

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

    if uploaded_file:
        show_file_info(uploaded_file)
        if st.button("Generate summary", type="primary"):
            with st.spinner("Extracting text and generating summary..."):
                try:
                    result = summarize_pdf(
                        uploaded_file=uploaded_file,
                        model_name=controls["model"],
                        max_length=controls["max_length"],
                        min_length=controls["min_length"],
                        chunk_size=controls["chunk_size"],
                        overlap=controls["overlap"],
                    )
                    show_results(result)
                except Exception as exc:
                    st.error(f"Could not summarize this PDF: {exc}")
    else:
        st.info("Upload a small/medium PDF to get started. Scans or image-only PDFs will not work well.")


if __name__ == "__main__":
    main()