import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from huggingface_hub import HfApi
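
# Refreshes data/site_content.json by re-scraping every page listed under
# "pages", then pushes the file back to the SmokeyBandit/SletcherSystems Space.
# The upload step needs Hugging Face credentials (an HF_TOKEN environment
# variable or a cached `huggingface-cli login`).
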
def clean_text(text):
    # Collapse runs of whitespace into single spaces; return "" for None/empty.
    if text:
        return " ".join(text.strip().split())
    return ""


def scrape_page(url):
    try:
        # A timeout keeps one unresponsive page from stalling the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Collect every <section>/<div> whose class name mentions "section".
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join([clean_text(p.text) for p in paragraphs if clean_text(p.text)])
            if not content:  # If no paragraphs, fall back to the section's full text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}
def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r", encoding="utf-8") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page listed in the JSON and merge the results in.
    for path in content["pages"]:
        url = content["pages"][path]["url"]
        print(f"Scraping {url}")
        page_data = scrape_page(url)
        content["pages"][path].update(page_data)

    # Extract specific content
    services = []
    solutions = []

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join([
                section["content"] for section in about_sections
                if section["content"]
            ])
        }

    # Save updated content
    with open("data/site_content.json", "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    # Upload to Hugging Face Space so the running app picks up the new data.
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )


if __name__ == "__main__":
    update_content()
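
# A typical manual run, assuming this script is saved as update_content.py
# (the filename is an assumption; it is not shown in the Space listing):
#     python update_content.py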