import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup
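# Third-party dependencies (not in the standard library): streamlit, requests,
# pandas, beautifulsoup4, plus lxml for the "lxml-xml" parser below and an
# Excel writer such as openpyxl for DataFrame.to_excel().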
# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]
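# find_sitemap() probes each common path above with a short timeout, then
# falls back to the "Sitemap:" directive in robots.txt.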
def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    # Try each well-known sitemap path first.
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue
    # Fall back to the "Sitemap:" directive in robots.txt.
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")
    # "lxml-xml" parses the document as XML, so <loc> tags keep their case.
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
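# A minimal sketch (not wired into the app) for handling sitemap *index*
# files: their <loc> entries point at child sitemaps rather than pages, so
# get_sitemap_links() alone would return sitemap URLs. The ".xml" suffix
# check is a heuristic assumption, not part of the sitemap spec.
def get_sitemap_links_recursive(sitemap_url):
    links = []
    for link in get_sitemap_links(sitemap_url):
        if link.endswith(".xml"):
            # Treat .xml entries as child sitemaps and expand them once.
            try:
                links.extend(get_sitemap_links(link))
            except Exception:
                links.append(link)
        else:
            links.append(link)
    return links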
def is_blog_link(link):
    """Filter links for blog-related URLs."""
    # Crude heuristic: any URL containing "blog" counts; adjust for sites
    # that use a different path convention (e.g. /news/ or /posts/).
    return "blog" in link.lower()
def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Use the detected encoding to avoid mojibake on non-UTF-8 pages.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")
        h1 = soup.find("h1")
        heading = h1.get_text(strip=True) if h1 else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        return f"{heading}\n\n{article_text}" if heading else article_text
    except Exception as e:
        return f"Error fetching article: {e}"
# Streamlit App
st.title("Blog Article Scraper")
# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")
if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)
        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                if not blog_links:
                    st.warning("No blog links found in the sitemap.")
                else:
                    st.success(f"Found {len(blog_links)} blog links.")
                    # Scrape each article and report progress.
                    st.write("Extracting article content...")
                    data = []
                    progress = st.progress(0)
                    for i, link in enumerate(blog_links):
                        article_text = extract_article_info(link)
                        data.append({"URL": link, "Article Text": article_text})
                        progress.progress((i + 1) / len(blog_links))
                    # Collect results into a DataFrame and display them.
                    df = pd.DataFrame(data)
                    st.write("Scraping completed.")
                    st.dataframe(df)
                    # Offer the results as an Excel download.
                    file_name = "blog_articles.xlsx"
                    df.to_excel(file_name, index=False)
                    with open(file_name, "rb") as file:
                        st.download_button(
                            label="Download Excel File",
                            data=file,
                            file_name="blog_articles.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        )
            except Exception as e:
                st.error(f"Error during scraping: {e}")