# crawl_website.py
import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

class SletcherScraper:
    """Scrape a fixed set of sletchersystems.com pages into a structured
    dict suitable for JSON serialization."""

    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
self.pages = [
"/",
"/clients",
"/solutions",
"/services",
"/about"
]
self.content = {
"timestamp": datetime.now().isoformat(),
"pages": {},
"company_info": {},
"services": [],
"solutions": [],
"clients": []
}

    def clean_text(self, text):
        """Collapse runs of whitespace (including newlines) to single spaces."""
        if text:
            return " ".join(text.strip().split())
        return ""
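
    # Illustrative behavior of clean_text (follows directly from the code
    # above; the input strings are made up):
    #   clean_text("  Custom\n  software   solutions ") -> "Custom software solutions"
    #   clean_text(None) -> ""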

    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            # A timeout keeps one hung connection from stalling the whole crawl.
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
page_data = {
"url": full_url,
"title": self.clean_text(soup.title.string) if soup.title else "",
"sections": []
}
            # Extract main content sections. BeautifulSoup applies a class_
            # callable to each individual class name, so this matches any
            # <section> or <div> whose class list contains "section".
            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
section_data = {
"heading": "",
"content": ""
}
# Get heading
heading = section.find(['h1', 'h2', 'h3'])
if heading:
section_data["heading"] = self.clean_text(heading.text)
# Get content paragraphs
paragraphs = section.find_all('p')
section_data["content"] = "\n".join([
self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
])
if section_data["heading"] or section_data["content"]:
page_data["sections"].append(section_data)
return page_data
        except Exception as e:
            # Broad catch so a single failing page doesn't abort the crawl.
            print(f"Error scraping {full_url}: {e}")
return None
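
    # Hypothetical standalone check of a single page (not part of the normal
    # scrape_all() pipeline; "/about" is one of the paths listed in __init__):
    #   scraper = SletcherScraper()
    #   print(json.dumps(scraper.scrape_page("/about"), indent=2))
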
def extract_specific_content(self):
# Extract services
if "/services" in self.content["pages"]:
services_page = self.content["pages"]["/services"]
for section in services_page["sections"]:
if section["heading"] and section["content"]:
self.content["services"].append({
"name": section["heading"],
"description": section["content"]
})
# Extract solutions
if "/solutions" in self.content["pages"]:
solutions_page = self.content["pages"]["/solutions"]
for section in solutions_page["sections"]:
if section["heading"] and section["content"]:
self.content["solutions"].append({
"name": section["heading"],
"description": section["content"]
})
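        # Extract clients. The clients list is initialized in __init__ and the
        # /clients page is scraped, but nothing mines it; this is a minimal
        # sketch that assumes /clients uses the same heading/paragraph section
        # layout as the pages above (an unverified assumption about the site).
        if "/clients" in self.content["pages"]:
            clients_page = self.content["pages"]["/clients"]
            for section in clients_page["sections"]:
                if section["heading"] and section["content"]:
                    self.content["clients"].append({
                        "name": section["heading"],
                        "description": section["content"]
                    })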
# Extract company info from about page
if "/about" in self.content["pages"]:
about_page = self.content["pages"]["/about"]
self.content["company_info"] = {
"name": "SletcherSystems",
"description": "\n".join([
section["content"] for section in about_page["sections"]
if section["content"]
])
}

    def scrape_all(self):
# Scrape each page
for page in self.pages:
print(f"Scraping {self.base_url}{page}")
page_data = self.scrape_page(page)
if page_data:
self.content["pages"][page] = page_data
time.sleep(1) # Be nice to the server
# Extract specific content
self.extract_specific_content()
return self.content

    def save_to_json(self, filename="site_content.json"):
        # Create the target directory if needed (main() writes into data/,
        # which may not exist yet).
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
scraper = SletcherScraper()
content = scraper.scrape_all()
scraper.save_to_json("data/site_content.json")
print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
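
# Shape of the JSON written by save_to_json(), with illustrative placeholder
# values; the keys match the dict assembled in __init__ and filled in by the
# methods above:
# {
#   "timestamp": "2025-01-01T12:00:00.000000",
#   "pages": {"/": {"url": "https://www.sletchersystems.com/", "title": "...",
#                   "sections": [{"heading": "...", "content": "..."}]}},
#   "company_info": {"name": "SletcherSystems", "description": "..."},
#   "services":  [{"name": "...", "description": "..."}],
#   "solutions": [{"name": "...", "description": "..."}],
#   "clients":   [{"name": "...", "description": "..."}]
# }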