loodvanniekerkginkgo committed
Commit 3e8741e · 1 Parent(s): 0768e70

Using readable hashes

Files changed (1):
  utils.py +118 -1
utils.py CHANGED
@@ -1,7 +1,8 @@
 import pandas as pd
 from datasets import load_dataset
 import gradio as gr
-
+import hashlib
+from typing import Iterable, Union
 from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
 
 pd.set_option('display.max_columns', None)
@@ -10,6 +11,10 @@ pd.set_option('display.max_columns', None)
 def show_output_box(message):
     return gr.update(value=message, visible=True)
 
+def anonymize_user(username: str) -> str:
+    # Anonymize using a hash of the username
+    return hashlib.sha256(username.encode()).hexdigest()[:8]
+
 
 def fetch_hf_results():
     # For debugging
@@ -24,4 +29,116 @@ def fetch_hf_results():
 
29
  # Show latest submission only
30
  df = df.sort_values("submission_time", ascending=False).drop_duplicates(subset=["model", "assay", "user"], keep="first")
31
  df["property"] = df["assay"].map(ASSAY_RENAME)
32
+
33
+ # Anonymize the user column at this point
34
+ df.loc[df["anonymous"] != False, "user"] = df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
35
+
36
  return df
+
+
+# Readable hashing function similar to coolname or codenamize
+ADJECTIVES = [
+    "ancient","brave","calm","clever","crimson","curious","dapper","eager",
+    "fuzzy","gentle","glowing","golden","happy","icy","jolly","lucky",
+    "magical","mellow","nimble","peachy","quick","royal","shiny","silent",
+    "sly","sparkly","spicy","spry","sturdy","sunny","swift","tiny","vivid",
+    "witty"
+]
+
+ANIMALS = [
+    "ant","bat","bear","bee","bison","boar","bug","cat","crab","crow",
+    "deer","dog","duck","eel","elk","fox","frog","goat","gull","hare",
+    "hawk","hen","horse","ibis","kid","kiwi","koala","lamb","lark","lemur",
+    "lion","llama","loon","lynx","mole","moose","mouse","newt","otter","owl",
+    "ox","panda","pig","prawn","puma","quail","quokka","rabbit","rat","ray",
+    "robin","seal","shark","sheep","shrew","skunk","slug","snail","snake",
+    "swan","toad","trout","turtle","vole","walrus","wasp","whale","wolf",
+    "worm","yak","zebra"
+]
+NOUNS = [
+    "rock","sand","star","tree","leaf","seed","stone","cloud","rain","snow",
+    "wind","fire","ash","dirt","mud","ice","wave","shell","dust","sun",
+    "moon","hill","lake","pond","reef","root","twig","wood"
+]
+
+
+def readable_hash(
+    data: Union[str, bytes, Iterable[int]],
+    *,
+    salt: Union[str, bytes, None] = None,
+    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS + NOUNS),
+    sep: str = "-",
+    checksum_len: int = 2,  # 0 to disable; 2-3 is plenty
+    case: str = "lower"  # "lower" | "title" | "upper"
+) -> str:
+    """
+    Deterministically map input data to 'adjective-animal[-checksum]'. Generated using ChatGPT; example outputs below are illustrative.
+
+    Examples
+    --------
+    >>> readable_hash("hello world")  # doctest: +SKIP
+    'magical-panda-6h'
+
+    >>> readable_hash("hello world", salt="my-app-v1", checksum_len=3)  # doctest: +SKIP
+    'royal-otter-1pz'
+
+    >>> readable_hash(b"\\x00\\x01\\x02\\x03", case="title", checksum_len=0)  # doctest: +SKIP
+    'Fuzzy-Otter'
+
+    Vocabulary
+    ----------
+    ADJECTIVES: 34 safe, descriptive words (e.g. "ancient", "brave", "silent", "swift")
+    ANIMALS: 71 short, common animals (e.g. "dog", "owl", "whale", "zebra")
+    NOUNS: optional set of 28 neutral nouns (e.g. "rock", "star", "tree", "cloud")
+
+    Combinations
+    ------------
+    - adjective + animal: ~2,400 unique names
+    - adjective + noun: ~950 unique names
+    - adjective + (animal or noun), the default here: ~3,400 unique names
+
+    Checksum
+    --------
+    An optional short base-36 suffix (e.g. "-6h" or "-1pz"). The checksum
+    acts as a disambiguator in case two different inputs map to the same
+    word combination. With 2-3 characters, collisions become much rarer.
+    If you only need fun, human-readable names, you can disable it by setting
+    ``checksum_len=0``. If you need unique, stable identifiers, keep it enabled.
+    """
+    if isinstance(data, str):
+        data = data.encode()
+    elif isinstance(data, Iterable) and not isinstance(data, (bytes, bytearray)):
+        data = bytes(data)
+
+    h = hashlib.blake2b(digest_size=8)  # fast, stable, short digest
+    if salt:
+        h.update(salt.encode() if isinstance(salt, str) else salt)
+        h.update(b"\x00")  # domain-separate salt from data
+    h.update(data)
+    digest = h.digest()
+
+    # Use the first 6 bytes to index words; last bytes for checksum
+    n1 = int.from_bytes(digest[0:3], "big")
+    n2 = int.from_bytes(digest[3:6], "big")
+
+    adj = words[0][n1 % len(words[0])]
+    noun = words[1][n2 % len(words[1])]
+    phrase = f"{adj}{sep}{noun}"
+
+    if checksum_len > 0:
+        # Short base36 checksum for collision visibility
+        cs = int.from_bytes(digest[6:], "big")
+        base36 = ""
+        alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
+        while cs:
+            cs, r = divmod(cs, 36)
+            base36 = alphabet[r] + base36
+        base36 = (base36 or "0")[:checksum_len]
+        phrase = f"{phrase}{sep}{base36}"
+
+    if case == "title":
+        phrase = sep.join(p.capitalize() for p in phrase.split(sep))
+    elif case == "upper":
+        phrase = phrase.upper()
+
+    return phrase
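
To put the docstring's checksum note in numbers: as committed, the vocabulary has 34 adjectives and 99 second words (71 animals + 28 nouns), so there are only about 3,400 two-word phrases, and the 2-character suffix does real disambiguation work. A rough birthday-bound sketch (a standard approximation; it treats the truncated checksum as a uniform 36**2 space, which is only roughly true):

import math

ADJ, SECOND = 34, 71 + 28            # len(ADJECTIVES), len(ANIMALS + NOUNS)
PHRASES = ADJ * SECOND               # 3,366 two-word phrases
WITH_CS = PHRASES * 36 ** 2          # ~4.4M names with a 2-char base36 suffix

def p_collision(n: int, space: int) -> float:
    # Probability that any two of n users share a name (birthday approximation).
    return 1.0 - math.exp(-n * (n - 1) / (2 * space))

print(f"{p_collision(100, PHRASES):.2f}")   # ~0.77: bare phrases collide often
print(f"{p_collision(100, WITH_CS):.4f}")   # ~0.0011: checksum keeps 100 users apart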
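
For a quick sanity check of the new helper, an interactive session might look like this (a sketch: it assumes utils.py is importable and uses a made-up username; concrete outputs depend on the word lists above, so none are hard-coded):

from utils import readable_hash

user = "alice@example.com"                          # hypothetical username

name = readable_hash(user)
assert name == readable_hash(user)                  # deterministic: same input, same name

print(name)                                         # e.g. 'sunny-otter-4k'
print(readable_hash(user, salt="leaderboard-v1"))   # salt separates deployments
print(readable_hash(user, checksum_len=0))          # bare 'adjective-animal'
print(readable_hash(user, case="title"))            # e.g. 'Sunny-Otter-4k'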