import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi


def clean_text(text):
    """Collapse runs of whitespace into single spaces; return "" for None."""
    if text:
        return " ".join(text.strip().split())
    return ""


def scrape_page(url):
    """Scrape one page into {"title": str, "sections": [{"heading", "content"}]}."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        sections = []
        # Treat any <section> or <div> whose class mentions "section" as a content block.
        for section in soup.find_all(
            ["section", "div"], class_=lambda x: x and "section" in x.lower()
        ):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(["h1", "h2", "h3"])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content from paragraphs
            paragraphs = section.find_all("p")
            content = "\n".join(
                clean_text(p.text) for p in paragraphs if clean_text(p.text)
            )
            if not content:
                # If no paragraphs, fall back to the section's full text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections,
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}


def update_content():
    """Re-scrape every page listed in data/site_content.json and push the result."""
    # Load existing structure
    try:
        with open("data/site_content.json", "r") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page
    for path in content["pages"]:
        url = content["pages"][path]["url"]
        print(f"Scraping {url}")
        page_data = scrape_page(url)
        content["pages"][path].update(page_data)

    # Extract page-specific content
    services = []
    solutions = []

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append(
                    {"name": section["heading"], "description": section["content"]}
                )
        content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append(
                    {"name": section["heading"], "description": section["content"]}
                )
        content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join(
                section["content"] for section in about_sections if section["content"]
            ),
        }

    # Save updated content
    with open("data/site_content.json", "w") as f:
        json.dump(content, f, indent=2)

    # Upload the refreshed JSON to the Hugging Face Space
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space",
    )


if __name__ == "__main__":
    update_content()
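
# A minimal sketch of the data/site_content.json structure update_content()
# expects before a run. The URLs and timestamp below are illustrative
# assumptions; only the "pages" -> {path: {"url": ...}} shape is required,
# and the /services, /solutions, and /about keys enable the extra
# extraction steps above:
#
# {
#   "timestamp": "1970-01-01T00:00:00",
#   "pages": {
#     "/services":  {"url": "https://example.com/services"},
#     "/solutions": {"url": "https://example.com/solutions"},
#     "/about":     {"url": "https://example.com/about"}
#   }
# }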