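"""Scrape public pages from sletchersystems.com and save the content as JSON.

Fetches a small, fixed list of pages, collects headings and paragraph text
from section-like elements, groups the results into services, solutions,
and company info, and writes everything to a JSON file. Run the module
directly to scrape all pages and save to data/site_content.json.
"""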
import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

class SletcherScraper:
    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
        self.pages = [
            "/",
            "/clients",
            "/solutions",
            "/services",
            "/about"
        ]
        self.content = {
            "timestamp": datetime.now().isoformat(),
            "pages": {},
            "company_info": {},
            "services": [],
            "solutions": [],
            "clients": []
        }
        
    def clean_text(self, text):
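        # Collapse runs of whitespace and strip; return "" for None/empty input.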
        if text:
            return " ".join(text.strip().split())
        return ""
    
    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            # Set a timeout so a slow or unresponsive server cannot hang the run.
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            page_data = {
                "url": full_url,
                "title": self.clean_text(soup.title.string) if soup.title else "",
                "sections": []
            }
            
            # Extract main content sections
            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
                section_data = {
                    "heading": "",
                    "content": ""
                }
                
                # Get heading
                heading = section.find(['h1', 'h2', 'h3'])
                if heading:
                    section_data["heading"] = self.clean_text(heading.text)
                
                # Get content paragraphs
                paragraphs = section.find_all('p')
                section_data["content"] = "\n".join([
                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
                ])
                
                if section_data["heading"] or section_data["content"]:
                    page_data["sections"].append(section_data)
            
            return page_data
            
        except requests.RequestException as e:
            print(f"Error scraping {full_url}: {e}")
            return None
    
    def extract_specific_content(self):
        # The services and solutions pages share the same section structure,
        # so extract both with one loop over (page path, target key) pairs.
        for path, key in [("/services", "services"), ("/solutions", "solutions")]:
            if path in self.content["pages"]:
                for section in self.content["pages"][path]["sections"]:
                    if section["heading"] and section["content"]:
                        self.content[key].append({
                            "name": section["heading"],
                            "description": section["content"]
                        })
        
        # Extract company info from about page
        if "/about" in self.content["pages"]:
            about_page = self.content["pages"]["/about"]
            self.content["company_info"] = {
                "name": "SletcherSystems",
                "description": "\n".join([
                    section["content"] for section in about_page["sections"]
                    if section["content"]
                ])
            }
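        # The /clients page is scraped into self.content["pages"], but no
        # structured extraction is implemented for it, so "clients" stays empty.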
    
    def scrape_all(self):
        # Scrape each page
        for page in self.pages:
            print(f"Scraping {self.base_url}{page}")
            page_data = self.scrape_page(page)
            if page_data:
                self.content["pages"][page] = page_data
            time.sleep(1)  # Be nice to the server
        
        # Extract specific content
        self.extract_specific_content()
        
        return self.content
    
    def save_to_json(self, filename="site_content.json"):
        # Create the target directory first; open() does not create it and
        # would otherwise fail for paths like "data/site_content.json".
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)

def main():
    scraper = SletcherScraper()
    content = scraper.scrape_all()
    scraper.save_to_json("data/site_content.json")
    print("Scraping completed and saved to data/site_content.json")

if __name__ == "__main__":
    main()