"""Scrape the SletcherSystems site pages listed in data/site_content.json,
refresh their stored content, and upload the result back to the Space."""

import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi


def clean_text(text):
    """Collapse runs of whitespace in *text* to single spaces."""
    if text:
        return " ".join(text.strip().split())
    return ""


def scrape_page(url):
    """Fetch *url* and extract its title plus heading/content sections."""
    try:
        # Time out rather than hang indefinitely on an unresponsive host
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Match <section>/<div> elements whose class name mentions "section"
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join(clean_text(p.text) for p in paragraphs if clean_text(p.text))
            if not content:  # If no paragraphs, get all text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}


def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page
    for page in content["pages"].values():
        url = page["url"]
        print(f"Scraping {url}")
        page.update(scrape_page(url))
    # Extract specific content
    services = []
    solutions = []

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join(
                section["content"] for section in about_sections
                if section["content"]
            ),
        }
    # Save updated content
    with open("data/site_content.json", "w") as f:
        json.dump(content, f, indent=2)

    # Upload to the Hugging Face Space (authentication comes from a cached
    # `huggingface-cli login` or the HF_TOKEN environment variable)
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )


if __name__ == "__main__":
    update_content()
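
# Usage sketch, assuming this file is saved as e.g. update_content.py (the
# actual filename is not shown in the source) and that data/site_content.json
# already exists with a structure like
# {"pages": {"/about": {"url": "https://..."}}}:
#
#   $ export HF_TOKEN=hf_...   # or run `huggingface-cli login` once
#   $ python update_content.py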