In [8]:
import os
import requests
import trafilatura
import textwrap

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

In [2]:
BASE_URLS = [
 "https://www.bostonpublicschools.org/enroll",
 "https://www.bostonpublicschools.org/enrollment/welcome-services/registration",
 "https://www.bostonpublicschools.org/enrollment/greatstarts-container/greatstarts",
]

DOMAIN = "www.bostonpublicschools.org"


In [3]:
def sanitize_url_to_key(url):
 """
 Convert a URL's path into a dash-separated key.
 If the path is empty, return "root".
 """
 path = urlparse(url).path.strip("/")
 if not path:
 return "root"
 return path.replace("/", "--")


def unhide_elements(html):
 """
 Modify the HTML to unhide hidden elements by:
 - Removing display:none and visibility hidden rules,
 - Removing aria-hidden attributes,
 - Forcing all tags to have visible styles.
 """
 soup = BeautifulSoup(html, "html.parser")

 # Unhide elements by adjusting style and aria-hidden attributes
 for tag in soup.select('[style*="display:none"], [aria-hidden="true"]'):
 if tag.has_attr("style"):
 styles = tag["style"].split(";")
 # Filter out any style setting for display or visibility
 styles = [
 s for s in styles if not any(x in s for x in ["display", "visibility"])
 ]
 tag["style"] = ";".join(styles)
 if tag.has_attr("aria-hidden"):
 del tag["aria-hidden"]

 # Force visibility on all tags
 for tag in soup.find_all(True):
 existing_style = tag.get("style", "")
 tag["style"] = existing_style + ";display:block;visibility:visible"

 return str(soup)



In [4]:
visited = set()
queue = deque(BASE_URLS)
results = {}

In [5]:
while queue:
 current_url = queue.popleft()
 if current_url in visited:
 continue
 visited.add(current_url)

 try:
 response = requests.get(current_url, timeout=10, allow_redirects=True)
 final_url = response.url
 parsed_final = urlparse(final_url)

 # Skip if redirected to an external site
 if parsed_final.netloc != DOMAIN:
 print(f"[SKIPPED] {current_url} → redirected to {final_url} (external)")
 continue

 if response.status_code != 200:
 print(f"[FAILED] {current_url} returned status code {response.status_code}")
 continue
 except Exception as e:
 print(f"[FAILED] {current_url} -> Exception: {e}")
 continue

 # Unhide hidden elements in HTML before extraction
 cleaned_html = unhide_elements(response.text)

 # Use trafilatura to extract clean text
 extracted = trafilatura.extract(cleaned_html)
 if not extracted:
 print(f"[SKIPPED] {final_url} — no extractable content.")
 continue

 # Create a unique key based on the FINAL resolved URL
 key = sanitize_url_to_key(final_url)
 if key in results:
 # Append a suffix if there's a key collision
 i = 1
 while f"{key}--{i}" in results:
 i += 1
 key = f"{key}--{i}"

 results[key] = {"url": final_url, "info": extracted.strip()}

 print(f"[PARSED] {final_url} → {key}")

 # Find internal links to add to the crawl queue
 soup = BeautifulSoup(response.text, "html.parser")
 for a in soup.find_all("a", href=True):
 link = urljoin(final_url, a["href"])
 parsed = urlparse(link)
 if parsed.netloc == DOMAIN and parsed.scheme.startswith("http"):
 if "#" not in link and link not in visited:
 queue.append(link)

[PARSED] https://www.bostonpublicschools.org/enroll → enroll
[PARSED] https://www.bostonpublicschools.org/enrollment/welcome-services/registration → enrollment--welcome-services--registration
[PARSED] https://www.bostonpublicschools.org/enrollment/greatstarts-container/greatstarts → enrollment--greatstarts-container--greatstarts
[PARSED] https://www.bostonpublicschools.org/ → root
[PARSED] https://www.bostonpublicschools.org/schools → schools
[PARSED] https://www.bostonpublicschools.org/about-bps/office-of-the-superintendent/superintendent-mary-skipper → about-bps--office-of-the-superintendent--superintendent-mary-skipper
[PARSED] https://www.bostonpublicschools.org/about-bps/office-of-the-superintendent/superintendent-mary-skipper → about-bps--office-of-the-superintendent--superintendent-mary-skipper--1
[PARSED] https://www.bostonpublicschools.org/about-bps/office-of-the-superintendent/executive-team → about-bps--office-of-the-superintendent--executive-team
[PARSED] https://www.boston

KeyboardInterrupt: 

In [9]:
# Ensure the "pages" folder exists
os.makedirs("pages", exist_ok=True)

for key, data in results.items():
 filepath = os.path.join("pages", f"{key}.txt")

 try:
 with open(filepath, "w", encoding="utf-8") as f:
 f.write(data["info"])
 print(f"[SAVED] {key} → {filepath}")
 except Exception as e:
 print(f"[ERROR] Failed to save {key} → {filepath}: {e}")

[SAVED] enroll → pages/enroll.txt
[SAVED] enrollment--welcome-services--registration → pages/enrollment--welcome-services--registration.txt
[SAVED] enrollment--greatstarts-container--greatstarts → pages/enrollment--greatstarts-container--greatstarts.txt
[SAVED] root → pages/root.txt
[SAVED] schools → pages/schools.txt
[SAVED] about-bps--office-of-the-superintendent--superintendent-mary-skipper → pages/about-bps--office-of-the-superintendent--superintendent-mary-skipper.txt
[SAVED] about-bps--office-of-the-superintendent--superintendent-mary-skipper--1 → pages/about-bps--office-of-the-superintendent--superintendent-mary-skipper--1.txt
[SAVED] about-bps--office-of-the-superintendent--executive-team → pages/about-bps--office-of-the-superintendent--executive-team.txt
[SAVED] about-bps--office-of-the-superintendent--organizational-chart → pages/about-bps--office-of-the-superintendent--organizational-chart.txt
[SAVED] about-bps--office-of-the-superintendent--past-superintendents → pages/abou

In [None]:
for k, v in results.items():
 print(f"\n[{k}]")
 print(v["url"])
 print(textwrap.shorten(v["info"], 300))

In [None]:
# os.makedirs("pages", exist_ok=True)

# while queue:
# current_url = queue.popleft()

# if current_url in visited:
# continue

# try:
# response = requests.get(current_url, timeout=10, allow_redirects=True)
# final_url = response.url
# parsed_final = urlparse(final_url)

# # Skip if redirected to an external site
# if parsed_final.netloc != DOMAIN:
# print(f"[SKIPPED] {current_url} → redirected to {final_url} (external)")
# continue

# # Skip if not HTML
# content_type = response.headers.get("Content-Type", "")
# if not content_type.startswith("text/html"):
# print(f"[SKIPPED] {final_url} — content type {content_type}")
# continue

# if response.status_code != 200:
# print(f"[FAILED] {current_url} returned status code {response.status_code}")
# continue

# except Exception as e:
# print(f"[FAILED] {current_url} -> Exception: {e}")
# continue

# visited.add(final_url)

# # Unhide hidden elements in HTML before extraction
# cleaned_html = unhide_elements(response.text)

# # Use trafilatura to extract clean text
# extracted = trafilatura.extract(cleaned_html)
# if not extracted:
# print(f"[SKIPPED] {final_url} — no extractable content.")
# continue

# # Create a unique key based on the FINAL resolved URL
# key = sanitize_url_to_key(final_url)
# if key in results:
# i = 1
# while f"{key}--{i}" in results:
# i += 1
# key = f"{key}--{i}"

# # Store in dictionary
# results[key] = {"url": final_url, "info": extracted.strip()}

# # Save to disk
# filepath = os.path.join("pages", f"{key}.txt")
# with open(filepath, "w", encoding="utf-8") as f:
# f.write(extracted.strip())

# print(f"[PARSED] {final_url} → {key} (saved to {filepath})")

# # Find internal links to add to the crawl queue
# soup = BeautifulSoup(response.text, "html.parser")
# for a in soup.find_all("a", href=True):
# link = urljoin(final_url, a["href"])
# parsed = urlparse(link)

# if parsed.netloc == DOMAIN and parsed.scheme.startswith("http"):
# path = parsed.path.strip("/")

# # Skip specific URL path prefixes
# if path.startswith(("fs/resource-manager", "Page/", "cms/", "archive/")):
# continue

# if "#" not in link and link not in visited:
# queue.append(link)