Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Serve a Hugging Face dataset.
"""
import dataclasses | |
import os | |
from typing import Optional | |
import datasets | |
import huggingface_hub | |
from renumics import spotlight # type: ignore | |
def login() -> None:
    """
    Login to Hugging Face Hub.

    The access token is read from the `HF_TOKEN` environment variable and
    passed to `huggingface_hub.login`. When the variable is unset or empty,
    nothing happens and no login is attempted.
    """
    # The walrus binds and tests in one step: an empty string counts as "no token".
    if token := os.environ.get("HF_TOKEN"):
        huggingface_hub.login(token)
@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.

    Plain container for the values that select which dataset (and optional
    enrichment dataset) gets loaded. Use `from_environ` to build an instance
    from the `HF_*` environment variables.
    """

    # Name of the dataset to load (required, from `HF_DATASET`).
    dataset: str
    # Optional subset name (from `HF_SUBSET`).
    subset: Optional[str] = None
    # Optional split name (from `HF_SPLIT`).
    split: Optional[str] = None
    # Optional revision of the dataset (from `HF_REVISION`).
    revision: Optional[str] = None
    # Optional name of an enrichment dataset (from `HF_ENRICHMENT`).
    enrichment: Optional[str] = None
    # Optional revision of the enrichment dataset (from `HF_ENRICHMENT_REVISION`).
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from environment.

        Empty environment variables are treated the same as unset ones.

        Returns:
            Settings populated from the `HF_*` environment variables.

        Raises:
            RuntimeError: If `HF_DATASET` is unset or empty.
        """
        # `or None` collapses empty strings to None so "set but blank" == "unset".
        dataset = os.environ.get("HF_DATASET") or None
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset=dataset,
            subset=os.environ.get("HF_SUBSET") or None,
            split=os.environ.get("HF_SPLIT") or None,
            revision=os.environ.get("HF_REVISION") or None,
            enrichment=os.environ.get("HF_ENRICHMENT") or None,
            enrichment_revision=os.environ.get("HF_ENRICHMENT_REVISION") or None,
        )

    def __str__(self) -> str:
        """Return a short description: dataset name with subset, split and revision."""
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"
def main() -> None:
    """
    Load and serve the given Hugging Face dataset.

    Logs in to the Hugging Face Hub (if a token is configured), loads the
    dataset described by the `HF_*` environment variables, optionally joins an
    enrichment dataset column-wise, and serves the result with Spotlight.

    Raises:
        TypeError: If the loaded object is not a `datasets.Dataset`.
        RuntimeError: If the enrichment dataset has a different length than
            the original dataset.
    """
    login()
    hf_settings = HFSettings.from_environ()

    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )

    # Validate the type right after loading: everything below (`ds.split`,
    # `ds.features`, column-wise concatenation) relies on `ds` being a single
    # `datasets.Dataset`, so fail here with the helpful hint rather than later
    # with an unrelated attribute error.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset and split as the
        # original dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        # axis=1 joins the enrichment columns onto the original rows.
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    # Sequence-typed columns whose name contains "embedding" are passed to
    # Spotlight with the embedding dtype.
    dtypes = {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds,
        host="0.0.0.0",
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout="spotlight-layout.json",
        analyze=False,
    )


if __name__ == "__main__":
    main()