import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
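# Assumed third-party dependencies (not pinned in this snippet): streamlit,
# requests, pandas, beautifulsoup4, plus lxml for the "lxml-xml" parser used
# below and openpyxl (or another Excel engine) for DataFrame.to_excel().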
# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]
def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
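# Example (hypothetical domain): find_sitemap("https://example.com") returns the
# first candidate URL that responds with HTTP 200, e.g.
# "https://example.com/sitemap.xml", or None if nothing matches and robots.txt
# declares no Sitemap entry.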
def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
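# Sketch (not wired into the app below): find_sitemap() may return a sitemap
# *index* whose <loc> entries point at child sitemaps rather than pages. This
# hypothetical helper expands one level of that nesting; its name and behaviour
# are assumptions layered on the code above, not part of the original app.
def expand_sitemap_index(sitemap_url):
    """Return page URLs, following one level of <sitemap> nesting if present."""
    response = requests.get(sitemap_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml-xml")
    # A sitemap index wraps each child sitemap URL in a <sitemap><loc> pair.
    children = [s.find("loc").text.strip() for s in soup.find_all("sitemap") if s.find("loc")]
    if not children:
        # Plain sitemap: the <loc> entries are the page URLs themselves.
        return [loc.text.strip() for loc in soup.find_all("loc")]
    links = []
    for child in children:
        try:
            links.extend(get_sitemap_links(child))
        except Exception:
            continue  # Skip child sitemaps that fail to download or parse.
    return links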
def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()
def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")
        heading = soup.find("h1").get_text(strip=True) if soup.find("h1") else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"
# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)
        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scraping articles
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to DataFrame and Display
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide download link for Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")