Spaces:

vibroniic4
/

jobs3

Running

App Files Community

vibroniic4 committed 13 days ago

Commit

da470ec

verified ·

1 Parent(s): 5716426

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -44

app.py CHANGED Viewed

@@ -1,59 +1,85 @@
 # app.py
 import streamlit as st
-from datasets import load_dataset
 import json
-# — UI Setup —
 st.set_page_config(layout="wide")
-st.title("📄 Discord Unveiled Viewer (Streaming)")
-# — Controls —
-mode      = st.sidebar.radio("Mode", ["Sample (fast)", "Stream (live)"])
-limit     = st.sidebar.slider("Messages per page", 10, 500, 100, step=10)
-pages     = st.sidebar.number_input("Pages to load", 1, 10, 1, step=1)
-offset    = st.sidebar.number_input("Start offset", 0, 1_000_000, 0, step=limit)
-fetch_btn = st.sidebar.button("Load Messages")
-if fetch_btn:
-    total_to_fetch = limit * pages
-    with st.spinner("⏳ Fetching messages…"):
         try:
-            if mode == "Sample (fast)":
-                # Quick indexed slice
-                ds = load_dataset(
-                    "SaisExperiments/Discord-Unveiled-Compressed",
-                    split=f"train[{offset}:{offset+total_to_fetch}]"
-                )
-            else:
-                # True streaming—won't download full dataset
-                ds = load_dataset(
-                    "SaisExperiments/Discord-Unveiled-Compressed",
-                    split="train",
-                    streaming=True
-                )
-            messages = []
-            for i, msg in enumerate(ds):
-                if i < offset:
-                    continue
-                if len(messages) >= total_to_fetch:
-                    break
-                messages.append(msg)
         except Exception as e:
-            st.error(f"❌ Failed to load dataset: {e}")
             st.stop()
-    st.success(f"✅ Loaded {len(messages)} messages (offset {offset}).")
-    # — Pagination & Display —
-    total    = len(messages)
     max_page = (total - 1) // limit + 1
-    page     = st.number_input("Display page", 1, max_page, 1, key="page_select")
-    lo, hi   = (page - 1) * limit, (page - 1) * limit + limit
-    st.markdown(f"### Showing messages **{lo+1}–{min(hi,total)}** of **{total}** (page {page}/{max_page})")
-    for msg in messages[lo:hi]:
         user    = msg.get("author", {}).get("username", "Unknown")
-        content = msg.get("content", "")
         st.markdown(f"**{user}**: {content}")
 else:
-    st.info("👉 Select options in the sidebar and click **Load Messages**.")

 # app.py
 import streamlit as st
+import requests
+import zstandard as zstd
 import json
+from io import TextIOWrapper
+from huggingface_hub import HfApi, hf_hub_url
+# ─ App config ─
 st.set_page_config(layout="wide")
+st.title("🚀 Discord Unveiled Auto-Streamer")
+# ─ Sidebar controls ─
+limit = st.sidebar.slider("Messages per page", 100, 5000, 1000, step=100)
+page  = st.sidebar.number_input("Page number", 1, 1, step=1)
+# ─ Discover the first ZST shard ─
+@st.cache_data(show_spinner=False)
+def get_first_shard_url():
+    api   = HfApi()
+    files = api.list_repo_files(
+        repo_id="SaisExperiments/Discord-Unveiled-Compressed",
+        repo_type="dataset"
+    )
+    shards = sorted(f for f in files if f.endswith(".zst"))
+    if not shards:
+        raise RuntimeError("No .zst shards found in the dataset.")
+    shard = shards[0]
+    # build raw resolve URL
+    url = f"https://huggingface.co/datasets/SaisExperiments/Discord-Unveiled-Compressed/resolve/main/{shard}"
+    return shard, url
+# ─ Stream and parse ZST ─
+@st.cache_data(show_spinner=False)
+def stream_zst(url: str, max_msgs: int):
+    r = requests.get(url, stream=True)
+    r.raise_for_status()
+    dctx     = zstd.ZstdDecompressor()
+    messages = []
+    with dctx.stream_reader(r.raw) as reader:
+        text = TextIOWrapper(reader, encoding="utf-8")
+        for i, line in enumerate(text):
+            if i >= max_msgs:
+                break
+            try:
+                messages.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue
+    return messages
+# ─ Main ─
+if st.button("⚡ Load & Stream Discord Messages"):
+    # 1) discover shard
+    with st.spinner("🔍 Discovering ZST shard…"):
         try:
+            shard_name, shard_url = get_first_shard_url()
+            st.success(f"✅ Found shard: **{shard_name}**")
+            st.write("URL:", shard_url)
         except Exception as e:
+            st.error(f"❌ Shard discovery failed: {e}")
             st.stop()
+    # 2) fetch & cache messages
+    total_to_fetch = limit * page
+    with st.spinner("📥 Streaming & decompressing…"):
+        try:
+            all_msgs = stream_zst(shard_url, total_to_fetch)
+        except Exception as e:
+            st.error(f"❌ Streaming failed: {e}")
+            st.stop()
+    # 3) paginate & display
+    total    = len(all_msgs)
     max_page = (total - 1) // limit + 1
+    current  = min(page, max_page)
+    start    = (current - 1) * limit
+    end      = start + limit
+    st.markdown(f"### Showing messages **{start+1}–{min(end,total)}** of **{total}**, page **{current}/{max_page}**")
+    for msg in all_msgs[start:end]:
         user    = msg.get("author", {}).get("username", "Unknown")
+        content = msg.get("content","")
         st.markdown(f"**{user}**: {content}")
 else:
+    st.info("👉 Click **Load & Stream Discord Messages** to begin.")