vibroniic4 committed on
Commit
da470ec
·
verified ·
1 Parent(s): 5716426

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -44
app.py CHANGED
@@ -1,59 +1,85 @@
1
  # app.py
2
  import streamlit as st
3
- from datasets import load_dataset
 
4
  import json
 
 
5
 
6
- # UI Setup
7
  st.set_page_config(layout="wide")
8
- st.title("📄 Discord Unveiled Viewer (Streaming)")
9
-
10
- # Controls
11
- mode = st.sidebar.radio("Mode", ["Sample (fast)", "Stream (live)"])
12
- limit = st.sidebar.slider("Messages per page", 10, 500, 100, step=10)
13
- pages = st.sidebar.number_input("Pages to load", 1, 10, 1, step=1)
14
- offset = st.sidebar.number_input("Start offset", 0, 1_000_000, 0, step=limit)
15
- fetch_btn = st.sidebar.button("Load Messages")
16
-
17
- if fetch_btn:
18
- total_to_fetch = limit * pages
19
- with st.spinner("⏳ Fetching messages…"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
- if mode == "Sample (fast)":
22
- # Quick indexed slice
23
- ds = load_dataset(
24
- "SaisExperiments/Discord-Unveiled-Compressed",
25
- split=f"train[{offset}:{offset+total_to_fetch}]"
26
- )
27
- else:
28
- # True streaming—won't download full dataset
29
- ds = load_dataset(
30
- "SaisExperiments/Discord-Unveiled-Compressed",
31
- split="train",
32
- streaming=True
33
- )
34
- messages = []
35
- for i, msg in enumerate(ds):
36
- if i < offset:
37
- continue
38
- if len(messages) >= total_to_fetch:
39
- break
40
- messages.append(msg)
41
  except Exception as e:
42
- st.error(f"❌ Failed to load dataset: {e}")
43
  st.stop()
44
 
45
- st.success(f"✅ Loaded {len(messages)} messages (offset {offset}).")
 
 
 
 
 
 
 
46
 
47
- # Pagination & Display —
48
- total = len(messages)
49
  max_page = (total - 1) // limit + 1
50
- page = st.number_input("Display page", 1, max_page, 1, key="page_select")
51
- lo, hi = (page - 1) * limit, (page - 1) * limit + limit
 
52
 
53
- st.markdown(f"### Showing messages **{lo+1}–{min(hi,total)}** of **{total}** (page {page}/{max_page})")
54
- for msg in messages[lo:hi]:
55
  user = msg.get("author", {}).get("username", "Unknown")
56
- content = msg.get("content", "")
57
  st.markdown(f"**{user}**: {content}")
58
  else:
59
- st.info("👉 Select options in the sidebar and click **Load Messages**.")
 
1
  # app.py
2
  import streamlit as st
3
+ import requests
4
+ import zstandard as zstd
5
  import json
6
+ from io import TextIOWrapper
7
+ from huggingface_hub import HfApi, hf_hub_url
8
 
9
# App config
st.set_page_config(layout="wide")
st.title("🚀 Discord Unveiled Auto-Streamer")

# Sidebar controls
limit = st.sidebar.slider("Messages per page", 100, 5000, 1000, step=100)
# number_input's positional order is (label, min_value, max_value, value).
# Passing `1, 1` positionally pinned both bounds to 1, so the page could never
# move past 1. Keyword arguments keep the lower bound and leave the upper
# bound open; the display code clamps to the real page count later.
page = st.sidebar.number_input("Page number", min_value=1, value=1, step=1)
16
+
17
+ # Discover the first ZST shard ─
18
@st.cache_data(show_spinner=False)
def get_first_shard_url():
    """Locate the first ``.zst`` shard of the dataset and build its download URL.

    Lists the files of the dataset repository, keeps those ending in
    ``.zst``, and picks the lexicographically first one.

    Returns:
        A ``(shard, url)`` tuple: the shard's path inside the repository and
        the URL it can be downloaded from.

    Raises:
        RuntimeError: If the repository contains no ``.zst`` file.
    """
    repo_id = "SaisExperiments/Discord-Unveiled-Compressed"
    files = HfApi().list_repo_files(repo_id=repo_id, repo_type="dataset")
    shards = sorted(f for f in files if f.endswith(".zst"))
    if not shards:
        raise RuntimeError("No .zst shards found in the dataset.")
    shard = shards[0]
    # Let the hub helper build the resolve URL so that file names containing
    # spaces or other reserved characters are percent-encoded correctly.
    url = hf_hub_url(repo_id=repo_id, filename=shard, repo_type="dataset")
    return shard, url
32
+
33
+ # ─ Stream and parse ZST ─
34
@st.cache_data(show_spinner=False)
def stream_zst(url: str, max_msgs: int):
    """Stream a Zstandard-compressed JSON-lines file and parse its first records.

    The HTTP body is decompressed on the fly and read line by line; each line
    is parsed as JSON. Lines that are not valid JSON are skipped and do not
    count toward the limit.

    Args:
        url: Address of the ``.zst`` file to download.
        max_msgs: Maximum number of parsed records to return.

    Returns:
        A list of at most ``max_msgs`` decoded JSON values, in file order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    messages = []
    # The context manager releases the streamed connection even when we stop
    # reading early; the timeout keeps a stalled server from hanging the app.
    with requests.get(url, stream=True, timeout=(10, 60)) as r:
        r.raise_for_status()
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(r.raw) as reader:
            text = TextIOWrapper(reader, encoding="utf-8")
            for line in text:
                # Count parsed messages, not raw lines, so malformed lines
                # do not eat into the requested quota.
                if len(messages) >= max_msgs:
                    break
                try:
                    messages.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    return messages
50
+
51
+ # ─ Main ─
52
if st.button("⚡ Load & Stream Discord Messages"):
    # 1) discover shard
    with st.spinner("🔍 Discovering ZST shard…"):
        try:
            shard_name, shard_url = get_first_shard_url()
        except Exception as e:
            st.error(f"❌ Shard discovery failed: {e}")
            st.stop()
        else:
            st.success(f"✅ Found shard: **{shard_name}**")
            st.write("URL:", shard_url)

    # 2) fetch & cache messages
    # Everything up to and including the requested page has to be read,
    # because the compressed stream can only be consumed from the start.
    total_to_fetch = limit * page
    with st.spinner("📥 Streaming & decompressing…"):
        try:
            all_msgs = stream_zst(shard_url, total_to_fetch)
        except Exception as e:
            st.error(f"❌ Streaming failed: {e}")
            st.stop()

    # 3) paginate & display
    total = len(all_msgs)
    if total == 0:
        # Without this guard max_page would be 0 and the header would show a
        # negative message range.
        st.warning("⚠️ No messages could be read from this shard.")
        st.stop()

    max_page = (total - 1) // limit + 1
    # Clamp in case the shard holds fewer messages than the requested page.
    current = min(page, max_page)
    start = (current - 1) * limit
    end = start + limit

    st.markdown(f"### Showing messages **{start+1}–{min(end,total)}** of **{total}**, page **{current}/{max_page}**")
    for msg in all_msgs[start:end]:
        # Any valid JSON line is accepted upstream, so a record may not be an
        # object at all; skip those instead of crashing on `.get`.
        if not isinstance(msg, dict):
            continue
        # "author" may be absent or JSON null; both fall back to "Unknown".
        author = msg.get("author")
        user = author.get("username", "Unknown") if isinstance(author, dict) else "Unknown"
        content = msg.get("content", "")
        st.markdown(f"**{user}**: {content}")
else:
    st.info("👉 Click **Load & Stream Discord Messages** to begin.")