import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
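# Assumed third-party dependencies (not pinned in this snippet): streamlit,
# requests, pandas, beautifulsoup4, plus lxml for the "lxml-xml" parser used
# below and openpyxl (or another Excel engine) for DataFrame.to_excel().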
# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]
def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
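# Example (hypothetical domain): find_sitemap("https://example.com") returns the
# first candidate URL that responds with HTTP 200, e.g.
# "https://example.com/sitemap.xml", or None if nothing matches and robots.txt
# declares no Sitemap entry.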
def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
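# Sketch (not wired into the app below): find_sitemap() may return a sitemap
# *index* whose <loc> entries point at child sitemaps rather than pages. This
# hypothetical helper expands one level of that nesting; its name and behaviour
# are assumptions layered on the code above, not part of the original app.
def expand_sitemap_index(sitemap_url):
    """Return page URLs, following one level of <sitemap> nesting if present."""
    response = requests.get(sitemap_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml-xml")
    # A sitemap index wraps each child sitemap URL in a <sitemap><loc> pair.
    children = [s.find("loc").text.strip() for s in soup.find_all("sitemap") if s.find("loc")]
    if not children:
        # Plain sitemap: the <loc> entries are the page URLs themselves.
        return [loc.text.strip() for loc in soup.find_all("loc")]
    links = []
    for child in children:
        try:
            links.extend(get_sitemap_links(child))
        except Exception:
            continue  # Skip child sitemaps that fail to download or parse.
    return links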
def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()
def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")
        heading = soup.find("h1").get_text(strip=True) if soup.find("h1") else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"
# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)
        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scraping articles
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to DataFrame and Display
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide download link for Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")