abdulllah01 committed on
Commit f9d793f · verified · 1 Parent(s): 5c47698

Create app.py

Files changed (1)
  1. app.py +129 -0
app.py ADDED
@@ -0,0 +1,129 @@
+ import streamlit as st
+ import requests
+ import pandas as pd
+ from bs4 import BeautifulSoup
+
+ # Helper Functions
+ COMMON_SITEMAP_LOCATIONS = [
+     "/sitemap.xml",
+     "/sitemap_index.xml",
+     "/sitemap-index.xml",
+     "/sitemap.php",
+     "/sitemap.txt",
+     "/sitemap.xml.gz",
+     "/sitemap/",
+     "/sitemap/sitemap.xml",
+     "/sitemapindex.xml",
+     "/sitemap/index.xml",
+     "/sitemap1.xml",
+     "/rss/",
+     "/rss.xml",
+     "/atom.xml",
+ ]
+
+
+ def find_sitemap(domain):
+     """Locate the sitemap URL by checking common locations and robots.txt."""
+     for path in COMMON_SITEMAP_LOCATIONS:
+         sitemap_url = domain.rstrip("/") + path
+         try:
+             response = requests.get(sitemap_url, timeout=5)
+             if response.status_code == 200:
+                 return sitemap_url
+         except requests.RequestException:
+             continue
+
+     robots_url = domain.rstrip("/") + "/robots.txt"
+     try:
+         response = requests.get(robots_url, timeout=5)
+         if response.status_code == 200:
+             for line in response.text.splitlines():
+                 if line.lower().startswith("sitemap:"):
+                     return line.split(":", 1)[1].strip()
+     except requests.RequestException:
+         pass
+     return None
+
+
+ def get_sitemap_links(sitemap_url):
+     """Fetch all links from a sitemap."""
+     response = requests.get(sitemap_url)
+     if response.status_code != 200:
+         raise Exception(f"Failed to fetch sitemap: {response.status_code}")
+
+     soup = BeautifulSoup(response.content, "lxml-xml")
+     links = [loc.text.strip() for loc in soup.find_all("loc")]
+     return links
+
+
+ def is_blog_link(link):
+     """Filter links for blog-related URLs."""
+     return "blog" in link.lower()
+
+
+ def extract_article_info(url):
+     """Extract the article content from a URL."""
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         response.encoding = response.apparent_encoding
+         soup = BeautifulSoup(response.text, "html.parser")
+
+         heading = soup.find("h1").get_text(strip=True) if soup.find("h1") else None
+         all_paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
+         article_text = "\n\n".join(all_paragraphs)
+         full_article_text = f"{heading}\n\n{article_text}" if heading else article_text
+         return full_article_text
+     except Exception as e:
+         return f"Error fetching article: {e}"
+
+
+ # Streamlit App
+ st.title("Blog Article Scraper")
+
+ # Input Website URL
+ website_url = st.text_input("Enter the website URL (e.g., https://example.com):")
+
+ if st.button("Start Scraping"):
+     if not website_url:
+         st.error("Please enter a website URL.")
+     else:
+         st.write("Locating sitemap...")
+         sitemap_url = find_sitemap(website_url)
+
+         if not sitemap_url:
+             st.error("Could not locate a sitemap.")
+         else:
+             st.success(f"Sitemap found: {sitemap_url}")
+             st.write("Fetching links from sitemap...")
+             try:
+                 links = get_sitemap_links(sitemap_url)
+                 blog_links = [link for link in links if is_blog_link(link)]
+                 st.success(f"Found {len(blog_links)} blog links.")
+
+                 # Scraping articles
+                 st.write("Extracting article content...")
+                 data = []
+                 progress = st.progress(0)
+                 for i, link in enumerate(blog_links):
+                     article_text = extract_article_info(link)
+                     data.append({"URL": link, "Article Text": article_text})
+                     progress.progress((i + 1) / len(blog_links))
+
+                 # Save results to DataFrame and Display
+                 df = pd.DataFrame(data)
+                 st.write("Scraping completed.")
+                 st.dataframe(df)
+
+                 # Provide download link for Excel file
+                 file_name = "blog_articles.xlsx"
+                 df.to_excel(file_name, index=False)
+                 with open(file_name, "rb") as file:
+                     st.download_button(
+                         label="Download Excel File",
+                         data=file,
+                         file_name="blog_articles.xlsx",
+                         mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                     )
+             except Exception as e:
+                 st.error(f"Error during scraping: {e}")
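A quick way to sanity-check the helpers in app.py without launching the Streamlit UI is to import and call them directly. The sketch below is an assumed usage example, not part of this commit: it presumes app.py is importable from the working directory (importing it will also execute the module-level st.* calls, which only emit warnings when no Streamlit session is running) and uses https://example.com as a placeholder domain.

# Minimal sketch (assumed usage, not part of the commit):
# exercise the scraping helpers without the Streamlit UI.
from app import find_sitemap, get_sitemap_links, is_blog_link, extract_article_info

domain = "https://example.com"  # placeholder; replace with a real site
sitemap_url = find_sitemap(domain)
if sitemap_url:
    links = get_sitemap_links(sitemap_url)
    blog_links = [link for link in links if is_blog_link(link)]
    if blog_links:
        # Fetch only the first article to keep the check quick.
        print(extract_article_info(blog_links[0])[:500])
else:
    print("No sitemap found for", domain)

Running the app itself (streamlit run app.py) additionally needs lxml for the "lxml-xml" sitemap parser and an Excel writer such as openpyxl for DataFrame.to_excel, alongside streamlit, requests, pandas, and beautifulsoup4.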