# Jupiter-FAQ-streamlit / jupiter_help_scraper.py
# bhutesh65's picture
# Upload 12 files
# 001593c verified
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import time
# Site root only. The category path must NOT be part of it, because both the
# category endpoint and the per-topic URLs are built by appending to this.
BASE_URL = "https://community.jupiter.money"
# JSON endpoint of the Help category
CATEGORY_URL = f"{BASE_URL}/c/help/27.json"
def fetch_topic_urls(timeout=30):
    """Return the page URL of every topic listed by the Help category endpoint.

    Fetches ``CATEGORY_URL`` and builds one ``{BASE_URL}/t/<slug>/<id>`` URL
    for each entry under ``topic_list.topics`` in the JSON response.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of topic URLs, in the order the endpoint lists them.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the ``topic_list``/``topics``/``slug``/
            ``id`` keys read here.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(CATEGORY_URL, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to JSON-decode an error page.
    res.raise_for_status()
    data = res.json()

    return [
        f"{BASE_URL}/t/{topic['slug']}/{topic['id']}"
        for topic in data['topic_list']['topics']
    ]
def scrape_topic(url, timeout=30):
    """Fetch one topic and reduce it to a question/context/answer record.

    The topic id is taken from the last path segment of ``url`` and used to
    request ``https://community.jupiter.money/t/<id>.json``. The first post is
    treated as the question context and the second post as the answer; when
    the topic has only one post, that post is used for both.

    Args:
        url: Topic page URL whose final path segment is the topic id.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with keys ``url``, ``question`` (the topic title), ``context``
        and ``answer`` (plain text with HTML removed and surrounding
        whitespace stripped), or ``None`` when the topic has no posts.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the ``title``/``post_stream``/``posts``/
            ``cooked`` keys read here.
    """
    # Strip a trailing slash first so ".../slug/123/" still yields "123".
    topic_id = url.rstrip("/").split("/")[-1]
    topic_json_url = f"https://community.jupiter.money/t/{topic_id}.json"

    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(topic_json_url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to JSON-decode an error page.
    res.raise_for_status()
    data = res.json()

    question = data['title']
    posts = data['post_stream']['posts']
    if not posts:
        return None

    # First post = original question or context
    first_post = posts[0]['cooked']
    # Next post = usually the first answer; fall back to the first post
    answer_post = posts[1]['cooked'] if len(posts) > 1 else first_post

    # Remove HTML tags
    q_clean = BeautifulSoup(first_post, "html.parser").get_text()
    a_clean = BeautifulSoup(answer_post, "html.parser").get_text()

    return {
        "url": url,
        "question": question,
        "context": q_clean.strip(),
        "answer": a_clean.strip(),
    }
def main():
    """Scrape every Help topic and write the results to a JSON file.

    Collects topic URLs with ``fetch_topic_urls``, scrapes each one with
    ``scrape_topic``, and dumps the resulting records to
    ``jupiter_help_faqs.json`` in the current working directory. A topic that
    raises is reported on stdout and skipped; topics for which
    ``scrape_topic`` returns a falsy value are skipped silently.
    """
    topic_urls = fetch_topic_urls()
    print(f"Found {len(topic_urls)} topics.")

    faqs = []
    for url in tqdm(topic_urls):
        try:
            faq = scrape_topic(url)
            if faq:
                faqs.append(faq)
        except Exception as e:
            # Best-effort run: one bad topic must not abort the whole scrape.
            print(f"Error scraping {url}: {e}")
        finally:
            # Pause after failures too, so an erroring server is not
            # hit again immediately.
            time.sleep(1)  # Avoid hitting rate limits

    # Save as JSON
    with open("jupiter_help_faqs.json", "w", encoding="utf-8") as f:
        json.dump(faqs, f, indent=2, ensure_ascii=False)

    print("✅ Scraping complete. Saved to jupiter_help_faqs.json")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()