"""Scrape the SletcherSystems site pages listed in data/site_content.json,
refresh their stored content, and upload the result back to the Space."""

import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi


def clean_text(text):
    """Collapse runs of whitespace in *text* to single spaces."""
    if text:
        return " ".join(text.strip().split())
    return ""


def scrape_page(url):
    """Fetch *url* and extract its title plus heading/content sections."""
    try:
        # Time out rather than hang indefinitely on an unresponsive host
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Match <section>/<div> elements whose class name mentions "section"
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join(clean_text(p.text) for p in paragraphs if clean_text(p.text))
            if not content:  # If no paragraphs, get all text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}


def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page
    for page in content["pages"].values():
        url = page["url"]
        print(f"Scraping {url}")
        page.update(scrape_page(url))
    # Extract specific content
    services = []
    solutions = []

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join(
                section["content"] for section in about_sections
                if section["content"]
            ),
        }
    # Save updated content
    with open("data/site_content.json", "w") as f:
        json.dump(content, f, indent=2)

    # Upload to the Hugging Face Space (authentication comes from a cached
    # `huggingface-cli login` or the HF_TOKEN environment variable)
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )


if __name__ == "__main__":
    update_content()
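
# Usage sketch, assuming this file is saved as e.g. update_content.py (the
# actual filename is not shown in the source) and that data/site_content.json
# already exists with a structure like
# {"pages": {"/about": {"url": "https://..."}}}:
#
#   $ export HF_TOKEN=hf_...   # or run `huggingface-cli login` once
#   $ python update_content.py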