sps44's picture
Update run.py
0847ca6
#!/usr/bin/env python3
"""
Serve a Hugging Face dataset.
"""
import dataclasses
import os
from typing import Optional
import datasets
import huggingface_hub
from renumics import spotlight # type: ignore
def login() -> None:
    """
    Authenticate against the Hugging Face Hub if a token is configured.

    The token is taken from the `HF_TOKEN` environment variable. When the
    variable is unset or empty, no login is attempted.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        # Nothing to authenticate with.
        return
    huggingface_hub.login(hf_token)
@dataclasses.dataclass
class HFSettings:
    """
    Settings selecting the Hugging Face dataset to work with.

    Only `dataset` is mandatory; every other field defaults to `None`.
    """

    # Filled from `HF_DATASET` by `from_environ` (required).
    dataset: str
    # Filled from `HF_SUBSET` by `from_environ`.
    subset: Optional[str] = None
    # Filled from `HF_SPLIT` by `from_environ`.
    split: Optional[str] = None
    # Filled from `HF_REVISION` by `from_environ`.
    revision: Optional[str] = None
    # Filled from `HF_ENRICHMENT` by `from_environ`.
    enrichment: Optional[str] = None
    # Filled from `HF_ENRICHMENT_REVISION` by `from_environ`.
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Build the settings from `HF_*` environment variables.

        Unset and empty variables are both treated as "not provided" and
        become `None`.

        Raises:
            RuntimeError: If `HF_DATASET` is unset or empty.
        """

        def _read(variable: str) -> Optional[str]:
            # Collapse "" and missing into None.
            return os.environ.get(variable) or None

        dataset_name = _read("HF_DATASET")
        if dataset_name is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )

        return cls(
            dataset=dataset_name,
            subset=_read("HF_SUBSET"),
            split=_read("HF_SPLIT"),
            revision=_read("HF_REVISION"),
            enrichment=_read("HF_ENRICHMENT"),
            enrichment_revision=_read("HF_ENRICHMENT_REVISION"),
        )

    def __str__(self) -> str:
        """
        Render the dataset name followed by its subset, split and revision.
        """
        details = f"subset={self.subset},split={self.split},revision={self.revision}"
        return f"{self.dataset}[{details}]"
def _embedding_dtypes(ds: "datasets.Dataset") -> dict:
    """
    Map embedding columns of `ds` to Spotlight's embedding dtype.

    A column qualifies when its name contains "embedding" and its feature is
    a `datasets.Sequence`.
    """
    return {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }


def main() -> None:
    """
    Load and serve the given Hugging Face dataset.

    Logs in to the Hub, reads the settings from the environment, loads the
    dataset and, if an enrichment dataset is configured, appends its columns
    to the original dataset (`axis=1`). The result is served with Spotlight.

    Raises:
        TypeError: If the loaded dataset is not a `datasets.Dataset`.
        RuntimeError: If the enrichment dataset differs in length from the
            original dataset.
    """
    login()

    hf_settings = HFSettings.from_environ()

    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )

    # Validate the type *before* anything below relies on `Dataset`-only
    # behaviour (`len` as row count, `.split`, `.features`); otherwise a
    # missing subset/split surfaces as an unrelated error in the enrichment
    # branch instead of this actionable message.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset/split as the original
        # dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    dtypes = _embedding_dtypes(ds)

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds,
        host="0.0.0.0",
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout="spotlight-layout.json",
        analyze=False,
    )


if __name__ == "__main__":
    main()