# app.py
"""Streamlit app that streams the first .zst shard of a Hugging Face dataset.

The app discovers the alphabetically first ``.zst`` file in the dataset repo,
stream-decompresses it as newline-delimited JSON, and shows the parsed
messages one page at a time.
"""
import json
from io import TextIOWrapper

import requests
import streamlit as st
import zstandard as zstd
from huggingface_hub import HfApi, hf_hub_url

REPO_ID = "SaisExperiments/Discord-Unveiled-Compressed"
REPO_TYPE = "dataset"
# (connect, read) timeouts in seconds so a stalled download cannot hang the app.
REQUEST_TIMEOUT = (10, 60)

# ─ App config ─
st.set_page_config(layout="wide")
st.title("🚀 Discord Unveiled Auto-Streamer")

# ─ Sidebar controls ─
limit = st.sidebar.slider("Messages per page", 100, 5000, 1000, step=100)
# Only the lower bound is fixed; passing (1, 1) positionally would set
# min_value == max_value == 1 and lock the control to page 1.
page = st.sidebar.number_input("Page number", min_value=1, value=1, step=1)


# ─ Discover the first ZST shard ─
@st.cache_data(show_spinner=False)
def get_first_shard_url():
    """Return ``(shard_name, download_url)`` for the first ``.zst`` shard.

    Shards are ordered by plain string sort of their repo paths.

    Raises:
        RuntimeError: If the dataset repo contains no ``.zst`` files.
    """
    files = HfApi().list_repo_files(repo_id=REPO_ID, repo_type=REPO_TYPE)
    shards = sorted(f for f in files if f.endswith(".zst"))
    if not shards:
        raise RuntimeError("No .zst shards found in the dataset.")
    shard = shards[0]
    # hf_hub_url builds the same resolve/main URL and URL-quotes the filename.
    url = hf_hub_url(repo_id=REPO_ID, filename=shard, repo_type=REPO_TYPE)
    return shard, url


# ─ Stream and parse ZST ─
@st.cache_data(show_spinner=False)
def stream_zst(url: str, max_msgs: int) -> list:
    """Download *url* and return up to *max_msgs* parsed JSON lines.

    The body is decompressed on the fly, so only as much of the shard as is
    needed gets downloaded. Blank lines and lines that are not valid JSON are
    skipped and do not count towards *max_msgs*.

    Results are cached per ``(url, max_msgs)``, so asking for a larger
    *max_msgs* streams the shard again from the start.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    messages = []
    if max_msgs <= 0:
        return messages

    # The context manager releases the connection even when we stop early.
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(response.raw) as reader:
            text = TextIOWrapper(reader, encoding="utf-8")
            for line in text:
                if len(messages) >= max_msgs:
                    break
                line = line.strip()
                if not line:
                    continue
                try:
                    messages.append(json.loads(line))
                except json.JSONDecodeError:
                    # Best effort: ignore malformed records.
                    continue
    return messages


def _author_and_content(msg):
    """Return ``(username, content)`` for one record, tolerating odd shapes."""
    if not isinstance(msg, dict):
        # Not the expected JSON object; show the raw value rather than crash.
        return "Unknown", str(msg)
    author = msg.get("author")
    username = author.get("username") if isinstance(author, dict) else None
    return username or "Unknown", msg.get("content") or ""


# ─ Main ─
# st.button is only True during the rerun triggered by the click, so remember
# the request; otherwise changing the page or page size would hide the results.
if st.button("⚡ Load & Stream Discord Messages"):
    st.session_state["stream_requested"] = True

if st.session_state.get("stream_requested"):
    # 1) discover shard
    with st.spinner("🔍 Discovering ZST shard…"):
        try:
            shard_name, shard_url = get_first_shard_url()
            st.success(f"✅ Found shard: **{shard_name}**")
            st.write("URL:", shard_url)
        except Exception as e:
            st.error(f"❌ Shard discovery failed: {e}")
            st.stop()

    # 2) fetch & cache messages (everything up to the end of the chosen page)
    total_to_fetch = limit * page
    with st.spinner("📥 Streaming & decompressing…"):
        try:
            all_msgs = stream_zst(shard_url, total_to_fetch)
        except Exception as e:
            st.error(f"❌ Streaming failed: {e}")
            st.stop()

    # 3) paginate & display
    total = len(all_msgs)
    if total == 0:
        # Avoid a "page 0/0" header and a negative slice start.
        st.warning("No messages could be parsed from the shard.")
        st.stop()

    max_page = (total - 1) // limit + 1
    # Clamp in case the shard holds fewer messages than the page asks for.
    current = min(page, max_page)
    start = (current - 1) * limit
    end = start + limit

    st.markdown(
        f"### Showing messages **{start+1}–{min(end,total)}** of **{total}**, "
        f"page **{current}/{max_page}**"
    )

    for msg in all_msgs[start:end]:
        user, content = _author_and_content(msg)
        st.markdown(f"**{user}**: {content}")
else:
    st.info("👉 Click **Load & Stream Discord Messages** to begin.")