# crawl_website.py
import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

class SletcherScraper:
    """Scrape a fixed set of sletchersystems.com pages into a structured
    dict suitable for JSON serialization."""

    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
self.pages = [
"/",
"/clients",
"/solutions",
"/services",
"/about"
]
self.content = {
"timestamp": datetime.now().isoformat(),
"pages": {},
"company_info": {},
"services": [],
"solutions": [],
"clients": []
}

    def clean_text(self, text):
        """Collapse runs of whitespace (including newlines) to single spaces."""
        if text:
            return " ".join(text.strip().split())
        return ""
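
    # Illustrative behavior of clean_text (follows directly from the code
    # above; the input strings are made up):
    #   clean_text("  Custom\n  software   solutions ") -> "Custom software solutions"
    #   clean_text(None) -> ""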

    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            # A timeout keeps one hung connection from stalling the whole crawl.
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
page_data = {
"url": full_url,
"title": self.clean_text(soup.title.string) if soup.title else "",
"sections": []
}
            # Extract main content sections. BeautifulSoup applies a class_
            # callable to each individual class name, so this matches any
            # <section> or <div> whose class list contains "section".
            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
section_data = {
"heading": "",
"content": ""
}
# Get heading
heading = section.find(['h1', 'h2', 'h3'])
if heading:
section_data["heading"] = self.clean_text(heading.text)
# Get content paragraphs
paragraphs = section.find_all('p')
section_data["content"] = "\n".join([
self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
])
if section_data["heading"] or section_data["content"]:
page_data["sections"].append(section_data)
return page_data
        except Exception as e:
            # Broad catch so a single failing page doesn't abort the crawl.
            print(f"Error scraping {full_url}: {e}")
return None
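
    # Hypothetical standalone check of a single page (not part of the normal
    # scrape_all() pipeline; "/about" is one of the paths listed in __init__):
    #   scraper = SletcherScraper()
    #   print(json.dumps(scraper.scrape_page("/about"), indent=2))
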
def extract_specific_content(self):
# Extract services
if "/services" in self.content["pages"]:
services_page = self.content["pages"]["/services"]
for section in services_page["sections"]:
if section["heading"] and section["content"]:
self.content["services"].append({
"name": section["heading"],
"description": section["content"]
})
# Extract solutions
if "/solutions" in self.content["pages"]:
solutions_page = self.content["pages"]["/solutions"]
for section in solutions_page["sections"]:
if section["heading"] and section["content"]:
self.content["solutions"].append({
"name": section["heading"],
"description": section["content"]
})
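        # Extract clients. The clients list is initialized in __init__ and the
        # /clients page is scraped, but nothing mines it; this is a minimal
        # sketch that assumes /clients uses the same heading/paragraph section
        # layout as the pages above (an unverified assumption about the site).
        if "/clients" in self.content["pages"]:
            clients_page = self.content["pages"]["/clients"]
            for section in clients_page["sections"]:
                if section["heading"] and section["content"]:
                    self.content["clients"].append({
                        "name": section["heading"],
                        "description": section["content"]
                    })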
# Extract company info from about page
if "/about" in self.content["pages"]:
about_page = self.content["pages"]["/about"]
self.content["company_info"] = {
"name": "SletcherSystems",
"description": "\n".join([
section["content"] for section in about_page["sections"]
if section["content"]
])
}

    def scrape_all(self):
# Scrape each page
for page in self.pages:
print(f"Scraping {self.base_url}{page}")
page_data = self.scrape_page(page)
if page_data:
self.content["pages"][page] = page_data
time.sleep(1) # Be nice to the server
# Extract specific content
self.extract_specific_content()
return self.content

    def save_to_json(self, filename="site_content.json"):
        # Create the target directory if needed (main() writes into data/,
        # which may not exist yet).
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
scraper = SletcherScraper()
content = scraper.scrape_all()
scraper.save_to_json("data/site_content.json")
print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
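
# Shape of the JSON written by save_to_json(), with illustrative placeholder
# values; the keys match the dict assembled in __init__ and filled in by the
# methods above:
# {
#   "timestamp": "2025-01-01T12:00:00.000000",
#   "pages": {"/": {"url": "https://www.sletchersystems.com/", "title": "...",
#                   "sections": [{"heading": "...", "content": "..."}]}},
#   "company_info": {"name": "SletcherSystems", "description": "..."},
#   "services":  [{"name": "...", "description": "..."}],
#   "solutions": [{"name": "...", "description": "..."}],
#   "clients":   [{"name": "...", "description": "..."}]
# }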