Commit
·
3e8741e
1
Parent(s):
0768e70
Using readable hashes
Browse files
utils.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
from datasets import load_dataset
|
| 3 |
import gradio as gr
|
| 4 |
-
|
|
|
|
| 5 |
from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
|
| 6 |
|
| 7 |
pd.set_option('display.max_columns', None)
|
|
@@ -10,6 +11,10 @@ pd.set_option('display.max_columns', None)
|
|
| 10 |
def show_output_box(message):
|
| 11 |
return gr.update(value=message, visible=True)
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def fetch_hf_results():
|
| 15 |
# For debugging
|
|
@@ -24,4 +29,116 @@ def fetch_hf_results():
|
|
| 24 |
# Show latest submission only
|
| 25 |
df = df.sort_values("submission_time", ascending=False).drop_duplicates(subset=["model", "assay", "user"], keep="first")
|
| 26 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from datasets import load_dataset
|
| 3 |
import gradio as gr
|
| 4 |
+
import hashlib
|
| 5 |
+
from typing import Iterable, Union
|
| 6 |
from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
|
| 7 |
|
| 8 |
pd.set_option('display.max_columns', None)
|
|
|
|
| 11 |
def show_output_box(message):
    """Make the Gradio output box visible and set its text to *message*."""
    return gr.update(visible=True, value=message)
|
| 13 |
|
| 14 |
+
def anonymize_user(username: str) -> str:
    """Return a stable 8-character pseudonym for *username*.

    The pseudonym is the first 8 hex digits of the SHA-256 digest of the
    UTF-8-encoded username, so the same user always maps to the same token.
    """
    digest = hashlib.sha256(username.encode())
    return digest.hexdigest()[:8]
|
| 17 |
+
|
| 18 |
|
| 19 |
def fetch_hf_results():
|
| 20 |
# For debugging
|
|
|
|
| 29 |
# Show latest submission only
|
| 30 |
df = df.sort_values("submission_time", ascending=False).drop_duplicates(subset=["model", "assay", "user"], keep="first")
|
| 31 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
| 32 |
+
|
| 33 |
+
# Anonymize the user column at this point
|
| 34 |
+
df.loc[df["anonymous"] != False, "user"] = df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
|
| 35 |
+
|
| 36 |
return df
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Readable hashing function similar to coolname or codenamize.
# Vocabulary used by readable_hash(): 34 adjectives, 71 animals, 28 nouns.
ADJECTIVES = [
    "ancient", "brave", "calm", "clever", "crimson", "curious", "dapper", "eager",
    "fuzzy", "gentle", "glowing", "golden", "happy", "icy", "jolly", "lucky",
    "magical", "mellow", "nimble", "peachy", "quick", "royal", "shiny", "silent",
    "sly", "sparkly", "spicy", "spry", "sturdy", "sunny", "swift", "tiny", "vivid",
    "witty",
]

ANIMALS = [
    "ant", "bat", "bear", "bee", "bison", "boar", "bug", "cat", "crab", "crow",
    "deer", "dog", "duck", "eel", "elk", "fox", "frog", "goat", "gull", "hare",
    "hawk", "hen", "horse", "ibis", "kid", "kiwi", "koala", "lamb", "lark", "lemur",
    "lion", "llama", "loon", "lynx", "mole", "moose", "mouse", "newt", "otter", "owl",
    "ox", "panda", "pig", "prawn", "puma", "quail", "quokka", "rabbit", "rat", "ray",
    "robin", "seal", "shark", "sheep", "shrew", "skunk", "slug", "snail", "snake",
    "swan", "toad", "trout", "turtle", "vole", "walrus", "wasp", "whale", "wolf",
    "worm", "yak", "zebra",
]

NOUNS = [
    "rock", "sand", "star", "tree", "leaf", "seed", "stone", "cloud", "rain", "snow",
    "wind", "fire", "ash", "dirt", "mud", "ice", "wave", "shell", "dust", "sun",
    "moon", "hill", "lake", "pond", "reef", "root", "twig", "wood",
]


def readable_hash(
    data: Union[str, bytes, Iterable[int]],
    *,
    salt: Union[str, bytes, None] = None,
    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS + NOUNS),
    sep: str = "-",
    checksum_len: int = 2,   # 0 to disable; 2-3 is plenty
    case: str = "lower",     # "lower" | "title" | "upper"
) -> str:
    r"""Deterministically map input data to ``'adjective-noun[-checksum]'``.

    Parameters
    ----------
    data:
        A ``str`` (UTF-8 encoded before hashing), ``bytes``, or an iterable
        of ints in ``range(256)`` (converted via ``bytes()``).
    salt:
        Optional namespace value mixed into the hash; different salts yield
        unrelated phrases for the same ``data``.
    words:
        Pair of word lists indexed by the digest. Defaults to
        ``(ADJECTIVES, ANIMALS + NOUNS)`` — 34 x 99 = 3,366 phrase
        combinations before the checksum is considered.
    sep:
        Separator placed between words (and before the checksum).
    checksum_len:
        Length of the trailing base-36 disambiguator, e.g. ``"-6h"``. It makes
        distinct inputs that collide on the word pair still differ. Set to 0
        for purely human-readable names; keep it >0 for stable identifiers.
        (The suffix may be shorter than ``checksum_len`` when the checksum
        value is small.)
    case:
        ``"lower"`` (default), ``"title"``, or ``"upper"``.

    Returns
    -------
    str
        Phrase such as ``magical-panda-6h`` (word choice and checksum depend
        on the BLAKE2b digest of the input; outputs are stable across runs).

    Examples
    --------
    Calling with ``"hello world"`` always returns the same phrase, e.g.::

        readable_hash(b"\x00\x01\x02\x03", case="title", checksum_len=0)

    yields an ``Adjective-Noun`` phrase with no checksum suffix.
    """
    # Normalize input to bytes: strings are UTF-8 encoded, other iterables
    # of ints are packed with bytes(); bytes/bytearray pass through as-is.
    if isinstance(data, str):
        data = data.encode()
    elif isinstance(data, Iterable) and not isinstance(data, (bytes, bytearray)):
        data = bytes(data)

    h = hashlib.blake2b(digest_size=8)  # fast, stable, short digest
    if salt:
        h.update(salt.encode() if isinstance(salt, str) else salt)
        h.update(b"\x00")  # domain-separate salt from data
    h.update(data)
    digest = h.digest()

    # First 6 bytes index the word lists; last 2 bytes feed the checksum.
    n1 = int.from_bytes(digest[0:3], "big")
    n2 = int.from_bytes(digest[3:6], "big")

    adj = words[0][n1 % len(words[0])]
    noun = words[1][n2 % len(words[1])]
    phrase = f"{adj}{sep}{noun}"

    if checksum_len > 0:
        # Short base-36 checksum for collision visibility.
        cs = int.from_bytes(digest[6:], "big")
        base36 = ""
        alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
        while cs:
            cs, r = divmod(cs, 36)
            base36 = alphabet[r] + base36
        base36 = (base36 or "0")[:checksum_len]
        phrase = f"{phrase}{sep}{base36}"

    if case == "title":
        phrase = sep.join(p.capitalize() for p in phrase.split(sep))
    elif case == "upper":
        phrase = phrase.upper()

    return phrase
|