import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]


def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    # Try the well-known sitemap paths first.
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue

    # Fall back to the Sitemap directive in robots.txt.
    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass

    return None


def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    response = requests.get(sitemap_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")

    # Parse the XML and collect every <loc> entry (requires the lxml package).
    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links


def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()


def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # Use the first <h1> as the heading and join all paragraphs as the body.
        h1 = soup.find("h1")
        heading = h1.get_text(strip=True) if h1 else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)

        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"


# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)

        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scraping articles
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)

                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to DataFrame and Display
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide download link for Excel file (requires openpyxl)
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")