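"""Scraper for sletchersystems.com.

Requires the third-party packages `requests` and `beautifulsoup4`
(e.g. `pip install requests beautifulsoup4`).
"""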
import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

class SletcherScraper:
    """Scrapes a fixed set of pages from sletchersystems.com into a JSON structure."""

    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
        self.pages = [
            "/",
            "/clients",
            "/solutions",
            "/services",
            "/about",
        ]
        self.content = {
            "timestamp": datetime.now().isoformat(),
            "pages": {},
            "company_info": {},
            "services": [],
            "solutions": [],
            "clients": [],
        }

    def clean_text(self, text):
        """Collapse all runs of whitespace to single spaces; return "" for None/empty."""
        if text:
            return " ".join(text.split())
        return ""

    def scrape_page(self, url_path):
        """Fetch one page and return its title and text sections, or None on error."""
        full_url = self.base_url + url_path
        try:
            response = requests.get(full_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            page_data = {
                "url": full_url,
                "title": self.clean_text(soup.title.string) if soup.title else "",
                "sections": [],
            }

            # Extract main content sections: <section> or <div> elements whose
            # class attribute contains "section".
            for section in soup.find_all(
                ["section", "div"],
                class_=lambda x: x and "section" in x.lower(),
            ):
                section_data = {"heading": "", "content": ""}

                # Use the first heading found in the section, if any.
                heading = section.find(["h1", "h2", "h3"])
                if heading:
                    section_data["heading"] = self.clean_text(heading.text)

                # Join the section's non-empty paragraphs.
                paragraphs = section.find_all("p")
                section_data["content"] = "\n".join(
                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
                )

                if section_data["heading"] or section_data["content"]:
                    page_data["sections"].append(section_data)

            return page_data
        except Exception as e:
            print(f"Error scraping {full_url}: {e}")
            return None
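
    # A minimal sketch of hardening the fetch with retries and backoff, using
    # the standard requests/urllib3 retry machinery. This is not part of the
    # original scraper, so it is left commented out as one possible extension:
    #
    #     from requests.adapters import HTTPAdapter
    #     from urllib3.util.retry import Retry
    #
    #     session = requests.Session()
    #     session.mount("https://", HTTPAdapter(max_retries=Retry(
    #         total=3, backoff_factor=1,
    #         status_forcelist=[429, 500, 502, 503, 504],
    #     )))
    #     response = session.get(full_url, timeout=15)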

    def extract_specific_content(self):
        """Distill per-page sections into the services/solutions/company_info fields.

        NOTE: the "clients" list is declared in __init__ but is never populated
        here, even though the /clients page is scraped.
        """
        # Services and solutions share the same heading/description shape.
        for path, key in (("/services", "services"), ("/solutions", "solutions")):
            if path in self.content["pages"]:
                for section in self.content["pages"][path]["sections"]:
                    if section["heading"] and section["content"]:
                        self.content[key].append({
                            "name": section["heading"],
                            "description": section["content"],
                        })

        # Extract company info from the about page.
        if "/about" in self.content["pages"]:
            about_page = self.content["pages"]["/about"]
            self.content["company_info"] = {
                "name": "SletcherSystems",
                "description": "\n".join(
                    section["content"]
                    for section in about_page["sections"]
                    if section["content"]
                ),
            }

    def scrape_all(self):
        """Scrape every configured page, then derive the summary fields."""
        for page in self.pages:
            print(f"Scraping {self.base_url}{page}")
            page_data = self.scrape_page(page)
            if page_data:
                self.content["pages"][page] = page_data
            time.sleep(1)  # Be nice to the server between requests.
        self.extract_specific_content()
        return self.content

    def save_to_json(self, filename="site_content.json"):
        """Write the collected content to disk as UTF-8 JSON."""
        # Create the target directory if needed (main() writes into data/).
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
    scraper = SletcherScraper()
    scraper.scrape_all()
    scraper.save_to_json("data/site_content.json")
    print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
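
# A minimal sketch of how downstream code might consume the saved file,
# assuming the default output path used in main() above; commented out so
# the script's behavior is unchanged:
#
#     import json
#     with open("data/site_content.json", encoding="utf-8") as f:
#         site = json.load(f)
#     print(site["company_info"].get("name", ""))
#     for service in site["services"]:
#         print(service["name"])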