import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime
import time


class SletcherScraper:
    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
        self.pages = [
            "/",
            "/clients",
            "/solutions",
            "/services",
            "/about"
        ]
        self.content = {
            "timestamp": datetime.now().isoformat(),
            "pages": {},
            "company_info": {},
            "services": [],
            "solutions": [],
            "clients": []
        }

    def clean_text(self, text):
        # Collapse runs of whitespace and trim; tolerate None from missing tags.
        if text:
            return " ".join(text.strip().split())
        return ""

    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            # Fail fast instead of hanging if the server is unresponsive.
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            page_data = {
                "url": full_url,
                "title": self.clean_text(soup.title.string) if soup.title else "",
                "sections": []
            }
            # Extract main content sections: any <section> or <div> whose
            # class name contains "section"
            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
                section_data = {
                    "heading": "",
                    "content": ""
                }
                # Get the section's first heading, if any
                heading = section.find(['h1', 'h2', 'h3'])
                if heading:
                    section_data["heading"] = self.clean_text(heading.text)
                # Get content paragraphs, skipping empty ones
                paragraphs = section.find_all('p')
                section_data["content"] = "\n".join([
                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
                ])
                if section_data["heading"] or section_data["content"]:
                    page_data["sections"].append(section_data)
            return page_data
        except Exception as e:
            print(f"Error scraping {full_url}: {e}")
            return None

    def extract_specific_content(self):
        # Services and solutions share the same heading/description shape,
        # so both are extracted with the same loop.
        for path, key in (("/services", "services"), ("/solutions", "solutions")):
            if path in self.content["pages"]:
                for section in self.content["pages"][path]["sections"]:
                    if section["heading"] and section["content"]:
                        self.content[key].append({
                            "name": section["heading"],
                            "description": section["content"]
                        })
        # Extract company info from the about page
        if "/about" in self.content["pages"]:
            about_page = self.content["pages"]["/about"]
            self.content["company_info"] = {
                "name": "SletcherSystems",
                "description": "\n".join([
                    section["content"] for section in about_page["sections"]
                    if section["content"]
                ])
            }

    def scrape_all(self):
        # Scrape each page, pausing between requests to be nice to the server
        for page in self.pages:
            print(f"Scraping {self.base_url}{page}")
            page_data = self.scrape_page(page)
            if page_data:
                self.content["pages"][page] = page_data
            time.sleep(1)
        # Derive services, solutions, and company info from the raw pages
        self.extract_specific_content()
        return self.content

    def save_to_json(self, filename="site_content.json"):
        # Create the target directory first so the open() below can't fail
        # on a missing path (main() writes into data/).
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
    scraper = SletcherScraper()
    scraper.scrape_all()
    scraper.save_to_json("data/site_content.json")
    print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
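
# A minimal sketch of consuming the saved output (assumes the scrape above has
# already run and produced data/site_content.json; the keys mirror self.content
# as defined in __init__):
#
#     import json
#     with open("data/site_content.json", encoding="utf-8") as f:
#         data = json.load(f)
#     for service in data["services"]:
#         print(service["name"])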