# Jupiter Community Help-category FAQ scraper (Discourse forum).
import requests | |
from bs4 import BeautifulSoup | |
import json | |
from tqdm import tqdm | |
import time | |
# Root of the Discourse forum; all endpoints are built from this.
# BUG FIX: BASE_URL previously included "/c/help/27", so CATEGORY_URL
# doubled the category path ("…/c/help/27/c/help/27.json") and the topic
# URLs built from it ("{BASE_URL}/t/<slug>/<id>") were malformed.
BASE_URL = "https://community.jupiter.money"
CATEGORY_URL = f"{BASE_URL}/c/help/27.json"  # JSON listing of the Help category (slug "help", id 27)
def fetch_topic_urls():
    """Return topic-page URLs from the Help category's JSON listing.

    Returns:
        list[str]: one URL per topic, of the form "{BASE_URL}/t/<slug>/<id>".

    Raises:
        requests.HTTPError: if the category endpoint returns an error status.

    NOTE(review): only the first page of the topic list is fetched; Discourse
    paginates via `more_topics_url` — TODO confirm whether full coverage is
    needed and follow pagination if so.
    """
    # Timeout prevents the script from hanging forever on a stalled connection;
    # raise_for_status avoids trying to json-decode an HTML error page.
    res = requests.get(CATEGORY_URL, timeout=30)
    res.raise_for_status()
    data = res.json()
    return [
        f"{BASE_URL}/t/{topic['slug']}/{topic['id']}"
        for topic in data['topic_list']['topics']
    ]
def scrape_topic(url):
    """Scrape one forum topic and return it as a Q&A record.

    Args:
        url: Topic URL whose last path segment is the numeric topic id
            (".../t/<slug>/<id>").

    Returns:
        dict with keys "url", "question" (topic title), "context" (first
        post, HTML stripped) and "answer" (second post if present, else
        the first post) — or None when the topic has no posts.

    Raises:
        requests.HTTPError: if the topic endpoint returns an error status.
    """
    # rstrip guards against a trailing slash producing an empty id.
    topic_id = url.rstrip("/").split("/")[-1]
    topic_json_url = f"https://community.jupiter.money/t/{topic_id}.json"
    # Timeout + status check: fail fast instead of hanging or json-decoding
    # an error page; errors propagate to the caller's except block.
    res = requests.get(topic_json_url, timeout=30)
    res.raise_for_status()
    data = res.json()
    question = data['title']
    posts = data['post_stream']['posts']
    if not posts:  # empty list — nothing to extract
        return None
    # First post = original question or context.
    first_post = posts[0]['cooked']
    # Second post is usually the first answer; topics with no replies fall
    # back to the question text itself.
    answer_post = posts[1]['cooked'] if len(posts) > 1 else first_post
    # "cooked" bodies are rendered HTML; strip the tags. (Uses the
    # module-level BeautifulSoup import — the original re-imported it here.)
    q_clean = BeautifulSoup(first_post, "html.parser").get_text()
    a_clean = BeautifulSoup(answer_post, "html.parser").get_text()
    return {
        "url": url,
        "question": question,
        "context": q_clean.strip(),
        "answer": a_clean.strip(),
    }
def main():
    """Scrape every Help-category topic and save the results as JSON."""
    topic_urls = fetch_topic_urls()
    print(f"Found {len(topic_urls)} topics.")

    faqs = []
    for url in tqdm(topic_urls):
        try:
            record = scrape_topic(url)
            if record:
                faqs.append(record)
            time.sleep(1)  # Avoid hitting rate limits
        except Exception as e:
            # Best-effort: report the failing topic and keep going.
            print(f"Error scraping {url}: {e}")

    # Persist everything as pretty-printed, non-ASCII-safe JSON.
    with open("jupiter_help_faqs.json", "w", encoding="utf-8") as out:
        json.dump(faqs, out, indent=2, ensure_ascii=False)
    print("✅ Scraping complete. Saved to jupiter_help_faqs.json")
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()