sumuks HF Staff committed on
Commit
0ae507e
·
verified ·
1 Parent(s): f3d8e60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -51
app.py CHANGED
@@ -1,13 +1,14 @@
1
-
 
2
  import hashlib
3
  import json
4
  import os
5
  import uuid
6
- from collections import defaultdict, deque
7
  from dataclasses import dataclass, field
8
  from datetime import datetime, timezone
9
  from pathlib import Path
10
- from random import randint, randrange
11
 
12
  import gradio as gr
13
  from datasets import Dataset, load_dataset
@@ -19,9 +20,7 @@ def doc_hash(url: str, text: str) -> str:
19
 
20
 
21
  def filterfunc(x: dict) -> bool:
22
- # text length
23
  if len(x.get("text", "").split()) < 100:
24
- # very short content usually means it's not a high quality document
25
  return False
26
 
27
  excluded = {"Promotional/Advertisement", "Machine-Generated", "Images/Videos/Audio",
@@ -36,15 +35,13 @@ def filterfunc(x: dict) -> bool:
36
 
37
 
38
  class DocLoader:
39
- __slots__ = ("queue", "k", "counts", "processed", "total_docs", "_dataset")
40
 
41
- def __init__(self, processed: set[str], k: int = 20):
42
- self.queue = deque()
43
- self.k = k
44
- self.counts = defaultdict(int)
45
  self.processed = processed
46
- self.total_docs = 0
47
- self._dataset = None
 
48
  self._load()
49
 
50
  def _load(self):
@@ -54,34 +51,38 @@ class DocLoader:
54
  ds = ds.filter(filterfunc)
55
  logger.info(f"Filtered to {len(ds)} documents")
56
 
57
- self._dataset = {}
 
 
58
  for idx, doc in enumerate(ds):
59
  doc_key = doc.get("id", idx)
60
  doc_with_key = dict(doc)
61
  doc_with_key["_dataset_key"] = doc_key
62
  self._dataset[doc_key] = doc_with_key
63
-
64
- for doc_id, doc in self._dataset.items():
65
  url = doc.get("metadata", {}).get("url", doc.get("url", ""))
66
  h = doc_hash(url, doc.get("text", ""))
67
 
68
- if h in self.processed:
69
- continue
70
-
71
- if cat := doc.get("eai_taxonomy", {}).get("document_type_v2", {}).get("primary", {}).get("label"):
72
- min_count = min(self.counts.values(), default=0)
73
- if self.counts[cat] <= min_count or randrange(self.k) == 0:
74
- self.queue.append(doc)
75
- self.counts[cat] += 1
76
 
77
- self.total_docs = len(self.queue)
78
- logger.info(f"Loaded {self.total_docs} documents")
 
 
 
 
 
79
 
80
  def next(self) -> dict | None:
81
- return self.queue.popleft() if self.queue else None
 
 
 
 
82
 
83
  def get_by_id(self, doc_id: str | int) -> dict | None:
84
- # Handle both string and int lookups
85
  result = self._dataset.get(doc_id)
86
  if result is None and isinstance(doc_id, str) and doc_id.isdigit():
87
  result = self._dataset.get(int(doc_id))
@@ -91,7 +92,7 @@ class DocLoader:
91
 
92
  @property
93
  def remaining(self) -> int:
94
- return len(self.queue)
95
 
96
 
97
  @dataclass(slots=True)
@@ -99,7 +100,7 @@ class AnnotationStore:
99
  path: Path
100
  session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
101
  buffer: list[dict] = field(default_factory=list)
102
- threshold: int = field(default_factory=lambda: randint(20, 30))
103
  processed: set[str] = field(default_factory=set)
104
  annotations: list[dict] = field(default_factory=list)
105
  session_stats: dict = field(default_factory=lambda: {
@@ -117,11 +118,7 @@ class AnnotationStore:
117
  if rec := self._parse_line(line):
118
  self.processed.add(rec["hash"])
119
  self.annotations.append(rec)
120
- # Initialize session stats for loaded annotations
121
- if "decision" in rec:
122
- decision = rec["decision"]
123
- if decision not in self.session_stats:
124
- self.session_stats[decision] = 0
125
 
126
  def _parse_line(self, line: str) -> dict | None:
127
  try:
@@ -131,6 +128,7 @@ class AnnotationStore:
131
 
132
  def add(self, doc_hash: str, decision: str, doc_id: str | int):
133
  if doc_hash in self.processed:
 
134
  return
135
 
136
  rec = {
@@ -147,9 +145,10 @@ class AnnotationStore:
147
  self.annotations.append(rec)
148
 
149
  self.session_stats["total"] += 1
150
- if decision not in self.session_stats:
151
- self.session_stats[decision] = 0
152
- self.session_stats[decision] += 1
 
153
  self.session_stats["decisions"].append((datetime.now(timezone.utc), decision))
154
 
155
  if len(self.buffer) >= self.threshold:
@@ -158,7 +157,6 @@ class AnnotationStore:
158
  def flush(self):
159
  if not self.buffer or not (token := os.getenv("HF_TOKEN")):
160
  self.buffer.clear()
161
- self.threshold = randint(20, 30)
162
  return
163
 
164
  try:
@@ -170,8 +168,6 @@ class AnnotationStore:
170
  self.buffer.clear()
171
  except Exception as e:
172
  logger.error(f"Push failed: {e}")
173
- finally:
174
- self.threshold = randint(20, 30)
175
 
176
  def get_rate(self) -> float:
177
  if not self.session_stats["decisions"]:
@@ -191,7 +187,6 @@ store = AnnotationStore(Path("data/annotations.jsonl"))
191
  loader = DocLoader(store.processed)
192
  current = loader.next()
193
 
194
-
195
  # Viewer state
196
  viewer_state = {
197
  "annotations": [],
@@ -204,9 +199,6 @@ def format_stats() -> str:
204
  stats = store.session_stats
205
  rate = store.get_rate()
206
 
207
- selected_count = stats.get('selected', 0)
208
- discarded_count = stats.get('discarded', 0)
209
-
210
  return f"""
211
  <div class="stats-container">
212
  <div class="stat-item">
@@ -214,17 +206,21 @@ def format_stats() -> str:
214
  <div class="stat-label">Total Annotated</div>
215
  </div>
216
  <div class="stat-item">
217
- <div class="stat-value">{selected_count}</div>
218
  <div class="stat-label">Selected</div>
219
  </div>
220
  <div class="stat-item">
221
- <div class="stat-value">{discarded_count}</div>
222
  <div class="stat-label">Discarded</div>
223
  </div>
224
  <div class="stat-item">
225
  <div class="stat-value">{rate:.0f}/hr</div>
226
  <div class="stat-label">Annotation Rate</div>
227
  </div>
 
 
 
 
228
  </div>
229
  """
230
 
@@ -348,10 +344,8 @@ def update_viewer_filter(filter_value: str):
348
  viewer_state["filter"] = filter_value
349
  viewer_state["index"] = 0
350
 
351
- # Get filtered annotations
352
  viewer_state["annotations"] = store.get_filtered(filter_value)
353
 
354
- # Log for debugging
355
  logger.info(f"Filter: {filter_value}, Found {len(viewer_state['annotations'])} annotations")
356
 
357
  return update_viewer_display()
@@ -380,7 +374,6 @@ def update_viewer_display():
380
  doc = loader.get_by_id(annotation["id"])
381
 
382
  if not doc:
383
- # Log the issue for debugging
384
  logger.warning(f"Document not found for ID: {annotation['id']} (type: {type(annotation['id'])})")
385
  return (
386
  "<div class='viewer-error'>Document not found in dataset</div>",
@@ -714,7 +707,6 @@ def build() -> gr.Blocks:
714
  viewer_counter = gr.HTML("<div class='viewer-counter'>0 / 0</div>")
715
  next_btn = gr.Button("Next →", size="lg")
716
 
717
- # Initialize viewer
718
  filter_dropdown.change(
719
  update_viewer_filter,
720
  inputs=[filter_dropdown],
@@ -731,7 +723,6 @@ def build() -> gr.Blocks:
731
  outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn]
732
  )
733
 
734
- # Load initial viewer state
735
  demo.load(
736
  lambda: update_viewer_filter("all"),
737
  outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn]
 
1
+ #!/usr/bin/env python3
2
+ """Enhanced web document annotation tool with modern UI."""
3
  import hashlib
4
  import json
5
  import os
6
  import uuid
7
+ from collections import defaultdict
8
  from dataclasses import dataclass, field
9
  from datetime import datetime, timezone
10
  from pathlib import Path
11
+ from random import sample, shuffle
12
 
13
  import gradio as gr
14
  from datasets import Dataset, load_dataset
 
20
 
21
 
22
  def filterfunc(x: dict) -> bool:
 
23
  if len(x.get("text", "").split()) < 100:
 
24
  return False
25
 
26
  excluded = {"Promotional/Advertisement", "Machine-Generated", "Images/Videos/Audio",
 
35
 
36
 
37
  class DocLoader:
38
+ __slots__ = ("docs", "index", "processed", "_dataset")
39
 
40
+ def __init__(self, processed: set[str]):
 
 
 
41
  self.processed = processed
42
+ self.index = 0
43
+ self.docs = []
44
+ self._dataset = {}
45
  self._load()
46
 
47
  def _load(self):
 
51
  ds = ds.filter(filterfunc)
52
  logger.info(f"Filtered to {len(ds)} documents")
53
 
54
+ # Build dataset lookup and collect unprocessed docs
55
+ unprocessed = []
56
+
57
  for idx, doc in enumerate(ds):
58
  doc_key = doc.get("id", idx)
59
  doc_with_key = dict(doc)
60
  doc_with_key["_dataset_key"] = doc_key
61
  self._dataset[doc_key] = doc_with_key
62
+
63
+ # Check if already processed
64
  url = doc.get("metadata", {}).get("url", doc.get("url", ""))
65
  h = doc_hash(url, doc.get("text", ""))
66
 
67
+ if h not in self.processed:
68
+ unprocessed.append(doc_with_key)
 
 
 
 
 
 
69
 
70
+ logger.info(f"Found {len(unprocessed)} unprocessed documents")
71
+
72
+ # Randomize the order for this session
73
+ shuffle(unprocessed)
74
+ self.docs = unprocessed
75
+
76
+ logger.info(f"Loaded {len(self.docs)} documents for this session")
77
 
78
  def next(self) -> dict | None:
79
+ if self.index < len(self.docs):
80
+ doc = self.docs[self.index]
81
+ self.index += 1
82
+ return doc
83
+ return None
84
 
85
  def get_by_id(self, doc_id: str | int) -> dict | None:
 
86
  result = self._dataset.get(doc_id)
87
  if result is None and isinstance(doc_id, str) and doc_id.isdigit():
88
  result = self._dataset.get(int(doc_id))
 
92
 
93
  @property
94
  def remaining(self) -> int:
95
+ return max(0, len(self.docs) - self.index)
96
 
97
 
98
  @dataclass(slots=True)
 
100
  path: Path
101
  session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
102
  buffer: list[dict] = field(default_factory=list)
103
+ threshold: int = 25
104
  processed: set[str] = field(default_factory=set)
105
  annotations: list[dict] = field(default_factory=list)
106
  session_stats: dict = field(default_factory=lambda: {
 
118
  if rec := self._parse_line(line):
119
  self.processed.add(rec["hash"])
120
  self.annotations.append(rec)
121
+ logger.info(f"Loaded {len(self.processed)} existing annotations")
 
 
 
 
122
 
123
  def _parse_line(self, line: str) -> dict | None:
124
  try:
 
128
 
129
  def add(self, doc_hash: str, decision: str, doc_id: str | int):
130
  if doc_hash in self.processed:
131
+ logger.warning(f"Attempted to add already processed document: {doc_hash}")
132
  return
133
 
134
  rec = {
 
145
  self.annotations.append(rec)
146
 
147
  self.session_stats["total"] += 1
148
+ if decision == "selected":
149
+ self.session_stats["selected"] += 1
150
+ elif decision == "discarded":
151
+ self.session_stats["discarded"] += 1
152
  self.session_stats["decisions"].append((datetime.now(timezone.utc), decision))
153
 
154
  if len(self.buffer) >= self.threshold:
 
157
  def flush(self):
158
  if not self.buffer or not (token := os.getenv("HF_TOKEN")):
159
  self.buffer.clear()
 
160
  return
161
 
162
  try:
 
168
  self.buffer.clear()
169
  except Exception as e:
170
  logger.error(f"Push failed: {e}")
 
 
171
 
172
  def get_rate(self) -> float:
173
  if not self.session_stats["decisions"]:
 
187
  loader = DocLoader(store.processed)
188
  current = loader.next()
189
 
 
190
  # Viewer state
191
  viewer_state = {
192
  "annotations": [],
 
199
  stats = store.session_stats
200
  rate = store.get_rate()
201
 
 
 
 
202
  return f"""
203
  <div class="stats-container">
204
  <div class="stat-item">
 
206
  <div class="stat-label">Total Annotated</div>
207
  </div>
208
  <div class="stat-item">
209
+ <div class="stat-value">{stats['selected']}</div>
210
  <div class="stat-label">Selected</div>
211
  </div>
212
  <div class="stat-item">
213
+ <div class="stat-value">{stats['discarded']}</div>
214
  <div class="stat-label">Discarded</div>
215
  </div>
216
  <div class="stat-item">
217
  <div class="stat-value">{rate:.0f}/hr</div>
218
  <div class="stat-label">Annotation Rate</div>
219
  </div>
220
+ <div class="stat-item">
221
+ <div class="stat-value">{loader.remaining:,}</div>
222
+ <div class="stat-label">Remaining Docs</div>
223
+ </div>
224
  </div>
225
  """
226
 
 
344
  viewer_state["filter"] = filter_value
345
  viewer_state["index"] = 0
346
 
 
347
  viewer_state["annotations"] = store.get_filtered(filter_value)
348
 
 
349
  logger.info(f"Filter: {filter_value}, Found {len(viewer_state['annotations'])} annotations")
350
 
351
  return update_viewer_display()
 
374
  doc = loader.get_by_id(annotation["id"])
375
 
376
  if not doc:
 
377
  logger.warning(f"Document not found for ID: {annotation['id']} (type: {type(annotation['id'])})")
378
  return (
379
  "<div class='viewer-error'>Document not found in dataset</div>",
 
707
  viewer_counter = gr.HTML("<div class='viewer-counter'>0 / 0</div>")
708
  next_btn = gr.Button("Next →", size="lg")
709
 
 
710
  filter_dropdown.change(
711
  update_viewer_filter,
712
  inputs=[filter_dropdown],
 
723
  outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn]
724
  )
725
 
 
726
  demo.load(
727
  lambda: update_viewer_filter("all"),
728
  outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn]