import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime
import time


class SletcherScraper:
    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
        self.pages = [
            "/",
            "/clients",
            "/solutions",
            "/services",
            "/about"
        ]
        self.content = {
            "timestamp": datetime.now().isoformat(),
            "pages": {},
            "company_info": {},
            "services": [],
            "solutions": [],
            "clients": []
        }

    def clean_text(self, text):
        # Collapse runs of whitespace and trim; tolerate None from missing tags.
        if text:
            return " ".join(text.strip().split())
        return ""

    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            # Fail fast instead of hanging if the server is unresponsive.
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            page_data = {
                "url": full_url,
                "title": self.clean_text(soup.title.string) if soup.title else "",
                "sections": []
            }
            # Extract main content sections: any <section> or <div> whose
            # class name contains "section"
            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
                section_data = {
                    "heading": "",
                    "content": ""
                }
                # Get the section's first heading, if any
                heading = section.find(['h1', 'h2', 'h3'])
                if heading:
                    section_data["heading"] = self.clean_text(heading.text)
                # Get content paragraphs, skipping empty ones
                paragraphs = section.find_all('p')
                section_data["content"] = "\n".join([
                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
                ])
                if section_data["heading"] or section_data["content"]:
                    page_data["sections"].append(section_data)
            return page_data
        except Exception as e:
            print(f"Error scraping {full_url}: {e}")
            return None

    def extract_specific_content(self):
        # Services and solutions share the same heading/description shape,
        # so both are extracted with the same loop.
        for path, key in (("/services", "services"), ("/solutions", "solutions")):
            if path in self.content["pages"]:
                for section in self.content["pages"][path]["sections"]:
                    if section["heading"] and section["content"]:
                        self.content[key].append({
                            "name": section["heading"],
                            "description": section["content"]
                        })
        # Extract company info from the about page
        if "/about" in self.content["pages"]:
            about_page = self.content["pages"]["/about"]
            self.content["company_info"] = {
                "name": "SletcherSystems",
                "description": "\n".join([
                    section["content"] for section in about_page["sections"]
                    if section["content"]
                ])
            }

    def scrape_all(self):
        # Scrape each page, pausing between requests to be nice to the server
        for page in self.pages:
            print(f"Scraping {self.base_url}{page}")
            page_data = self.scrape_page(page)
            if page_data:
                self.content["pages"][page] = page_data
            time.sleep(1)
        # Derive services, solutions, and company info from the raw pages
        self.extract_specific_content()
        return self.content

    def save_to_json(self, filename="site_content.json"):
        # Create the target directory first so the open() below can't fail
        # on a missing path (main() writes into data/).
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
    scraper = SletcherScraper()
    scraper.scrape_all()
    scraper.save_to_json("data/site_content.json")
    print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
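
# A minimal sketch of consuming the saved output (assumes the scrape above has
# already run and produced data/site_content.json; the keys mirror self.content
# as defined in __init__):
#
#     import json
#     with open("data/site_content.json", encoding="utf-8") as f:
#         data = json.load(f)
#     for service in data["services"]:
#         print(service["name"])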