import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from huggingface_hub import HfApi
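
# Refreshes data/site_content.json by re-scraping every page listed under
# "pages", then pushes the file back to the SmokeyBandit/SletcherSystems Space.
# The upload step needs Hugging Face credentials (an HF_TOKEN environment
# variable or a cached `huggingface-cli login`).
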
def clean_text(text):
    # Collapse runs of whitespace into single spaces; return "" for None/empty.
    if text:
        return " ".join(text.strip().split())
    return ""


def scrape_page(url):
    try:
        # A timeout keeps one unresponsive page from stalling the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Collect every <section>/<div> whose class name mentions "section".
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join([clean_text(p.text) for p in paragraphs if clean_text(p.text)])
            if not content:  # If no paragraphs, fall back to the section's full text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}
def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r", encoding="utf-8") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page listed in the JSON and merge the results in.
    for path in content["pages"]:
        url = content["pages"][path]["url"]
        print(f"Scraping {url}")
        page_data = scrape_page(url)
        content["pages"][path].update(page_data)

    # Extract specific content
    services = []
    solutions = []

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join([
                section["content"] for section in about_sections
                if section["content"]
            ])
        }

    # Save updated content
    with open("data/site_content.json", "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    # Upload to Hugging Face Space so the running app picks up the new data.
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )


if __name__ == "__main__":
    update_content()
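
# A typical manual run, assuming this script is saved as update_content.py
# (the filename is an assumption; it is not shown in the Space listing):
#     python update_content.py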