abdulllah01
committed on
Create app.py
app.py
ADDED
@@ -0,0 +1,129 @@
import streamlit as st
import requests
import pandas as pd
from bs4 import BeautifulSoup

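# Dependency note: the "lxml-xml" parser used below assumes the lxml package is
# installed, and DataFrame.to_excel() for .xlsx output generally needs openpyxl.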
# Helper Functions
COMMON_SITEMAP_LOCATIONS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml",
    "/sitemap.php",
    "/sitemap.txt",
    "/sitemap.xml.gz",
    "/sitemap/",
    "/sitemap/sitemap.xml",
    "/sitemapindex.xml",
    "/sitemap/index.xml",
    "/sitemap1.xml",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
]


def find_sitemap(domain):
    """Locate the sitemap URL by checking common locations and robots.txt."""
    for path in COMMON_SITEMAP_LOCATIONS:
        sitemap_url = domain.rstrip("/") + path
        try:
            response = requests.get(sitemap_url, timeout=5)
            if response.status_code == 200:
                return sitemap_url
        except requests.RequestException:
            continue

    robots_url = domain.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    return line.split(":", 1)[1].strip()
    except requests.RequestException:
        pass
    return None
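# Illustrative example (hypothetical domain): find_sitemap("https://example.com")
# returns "https://example.com/sitemap.xml" if that common path answers with HTTP 200,
# falls back to any "Sitemap:" entry in robots.txt, and returns None otherwise.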


def get_sitemap_links(sitemap_url):
    """Fetch all links from a sitemap."""
    response = requests.get(sitemap_url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch sitemap: {response.status_code}")

    soup = BeautifulSoup(response.content, "lxml-xml")
    links = [loc.text.strip() for loc in soup.find_all("loc")]
    return links
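# Note: this collects every <loc> entry. If sitemap_url points to a sitemap index,
# those entries are URLs of nested sitemaps rather than page URLs; the function
# does not recurse into them.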


def is_blog_link(link):
    """Filter links for blog-related URLs."""
    return "blog" in link.lower()


def extract_article_info(url):
    """Extract the article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        heading = soup.find("h1").get_text(strip=True) if soup.find("h1") else None
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        article_text = "\n\n".join(all_paragraphs)
        full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
        return full_article_text
    except Exception as e:
        return f"Error fetching article: {e}"
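# Note: extraction is heuristic; it joins the first <h1> with the text of every <p>
# on the page, so navigation or footer paragraphs may be included for some sites.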


# Streamlit App
st.title("Blog Article Scraper")

# Input Website URL
website_url = st.text_input("Enter the website URL (e.g., https://example.com):")

if st.button("Start Scraping"):
    if not website_url:
        st.error("Please enter a website URL.")
    else:
        st.write("Locating sitemap...")
        sitemap_url = find_sitemap(website_url)

        if not sitemap_url:
            st.error("Could not locate a sitemap.")
        else:
            st.success(f"Sitemap found: {sitemap_url}")
            st.write("Fetching links from sitemap...")
            try:
                links = get_sitemap_links(sitemap_url)
                blog_links = [link for link in links if is_blog_link(link)]
                st.success(f"Found {len(blog_links)} blog links.")

                # Scraping articles
                st.write("Extracting article content...")
                data = []
                progress = st.progress(0)
                for i, link in enumerate(blog_links):
                    article_text = extract_article_info(link)
                    data.append({"URL": link, "Article Text": article_text})
                    progress.progress((i + 1) / len(blog_links))

                # Save results to DataFrame and Display
                df = pd.DataFrame(data)
                st.write("Scraping completed.")
                st.dataframe(df)

                # Provide download link for Excel file
                file_name = "blog_articles.xlsx"
                df.to_excel(file_name, index=False)
                with open(file_name, "rb") as file:
                    st.download_button(
                        label="Download Excel File",
                        data=file,
                        file_name="blog_articles.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"Error during scraping: {e}")
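# To try the app locally (standard Streamlit workflow, assuming streamlit, requests,
# pandas, beautifulsoup4, lxml, and openpyxl are installed):
#   streamlit run app.py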