Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Serve a Hugging Face dataset. | |
| """ | |
| import dataclasses | |
| import os | |
| from typing import Optional | |
| import datasets | |
| import huggingface_hub | |
| from renumics import spotlight # type: ignore | |
def login() -> None:
    """
    Authenticate against the Hugging Face Hub when a token is available.

    The token is read from the `HF_TOKEN` environment variable. If the
    variable is unset or empty, nothing happens and no login is attempted.
    """
    hub_token = os.environ.get("HF_TOKEN")
    if not hub_token:
        # No credentials supplied: continue anonymously.
        return
    huggingface_hub.login(hub_token)
@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.

    Declared as a dataclass so that the annotated fields below generate the
    positional/keyword constructor that `from_environ` relies on.

    Fields (and the environment variable each is read from):
        dataset: dataset identifier (`HF_DATASET`, required).
        subset: optional subset name (`HF_SUBSET`).
        split: optional split name (`HF_SPLIT`).
        revision: optional revision of the dataset (`HF_REVISION`).
        enrichment: optional identifier of an enrichment dataset
            (`HF_ENRICHMENT`).
        enrichment_revision: optional revision of the enrichment dataset
            (`HF_ENRICHMENT_REVISION`).
    """

    dataset: str
    subset: Optional[str] = None
    split: Optional[str] = None
    revision: Optional[str] = None
    enrichment: Optional[str] = None
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from environment.

        Unset and empty variables are both treated as "not given" (`None`).

        Raises:
            RuntimeError: if `HF_DATASET` is unset or empty.
        """
        # `or None` collapses empty strings to None so that an empty variable
        # behaves exactly like a missing one.
        dataset = os.environ.get("HF_DATASET") or None
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset=dataset,
            subset=os.environ.get("HF_SUBSET") or None,
            split=os.environ.get("HF_SPLIT") or None,
            revision=os.environ.get("HF_REVISION") or None,
            enrichment=os.environ.get("HF_ENRICHMENT") or None,
            enrichment_revision=os.environ.get("HF_ENRICHMENT_REVISION") or None,
        )

    def __str__(self) -> str:
        # Short human-readable form used in log messages; the enrichment
        # fields are intentionally not part of it.
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"
if __name__ == "__main__":
    # Load and serve the given Hugging Face dataset.
    login()

    hf_settings = HFSettings.from_environ()

    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )

    # Validate the loaded object before anything below relies on the
    # `datasets.Dataset` API (`len`, `.split`, `.features`); otherwise a
    # missing split surfaces as an unrelated error instead of this hint.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment dataset is loaded with the same subset and split as
        # the main dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        # Column-wise concatenation below requires both datasets to have the
        # same number of rows.
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        # axis=1: append the enrichment columns to the original rows.
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    # Sequence-typed columns with "embedding" in their name are passed to
    # Spotlight with its embedding dtype.
    dtypes = {}
    for col in ds.column_names:
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence):
            dtypes[col] = spotlight.dtypes.embedding_dtype

    # Nested split: an inner split holding table, similarity map/scatter plot
    # and histogram/metric tabs, followed by an inspector tab.
    layout = spotlight.layout.split(
        spotlight.layout.split(
            spotlight.layout.tab(spotlight.layout.table(), weight=4),
            spotlight.layout.tab(
                spotlight.layout.similaritymap(),
                spotlight.layout.scatterplot(),
                weight=3,
            ),
            spotlight.layout.tab(
                spotlight.layout.histogram(), spotlight.layout.metric(), weight=3
            ),
            weight=5,
        ),
        spotlight.layout.tab(spotlight.layout.inspector(), weight=3),
        orientation="vertical",
    )

    print(f"Serving Hugging Face dataset {hf_settings}.")
    # NOTE(review): host/port look like the values a Hugging Face Space
    # expects (all interfaces, port 7860) — confirm against the Space config.
    spotlight.show(
        ds,
        host="0.0.0.0",
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout=layout,
        analyze=True,
        no_browser=True,
    )