spotlight-cloome-demo

Sleeping

App Files Files Community

spotlight-cloome-demo / run.py

sps44

Update run.py

0847ca6 12 months ago

raw

history blame contribute delete

3.16 kB

	#!/usr/bin/env python3
	"""
	Serve a Hugging Face dataset.
	"""

	import dataclasses
	import os
	from typing import Optional

	import datasets
	import huggingface_hub
	from renumics import spotlight # type: ignore


	def login() -> None:
	    """
	    Log in to the Hugging Face Hub when a token is configured.

	    The token is read from the `HF_TOKEN` environment variable. When the
	    variable is unset or empty, no login is attempted.
	    """
	    token = os.environ.get("HF_TOKEN")
	    if not token:
	        # Nothing to authenticate with.
	        return
	    huggingface_hub.login(token)


	@dataclasses.dataclass
	class HFSettings:
	    """
	    Settings describing which Hugging Face dataset to load.

	    Only `dataset` is mandatory; every other field defaults to `None`.
	    """

	    # Main dataset and the optional selectors used when loading it.
	    dataset: str
	    subset: Optional[str] = None
	    split: Optional[str] = None
	    revision: Optional[str] = None

	    # Optional enrichment dataset and its revision.
	    enrichment: Optional[str] = None
	    enrichment_revision: Optional[str] = None

	    @classmethod
	    def from_environ(cls) -> "HFSettings":
	        """
	        Build the settings from `HF_*` environment variables.

	        `HF_DATASET` is required. `HF_SUBSET`, `HF_SPLIT`, `HF_REVISION`,
	        `HF_ENRICHMENT` and `HF_ENRICHMENT_REVISION` are optional. A variable
	        that is unset or empty is treated as missing (`None`).

	        Raises:
	            RuntimeError: if `HF_DATASET` is unset or empty.
	        """
	        dataset_name = os.environ.get("HF_DATASET")
	        if not dataset_name:
	            raise RuntimeError(
	                "Desired Hugging Face dataset must be set as `HF_DATASET` "
	                "environment variable."
	            )

	        # Listed in the same order as the dataclass fields after `dataset`,
	        # so the values can be passed positionally.
	        optional_variables = (
	            "HF_SUBSET",
	            "HF_SPLIT",
	            "HF_REVISION",
	            "HF_ENRICHMENT",
	            "HF_ENRICHMENT_REVISION",
	        )
	        optional_values = [
	            os.environ.get(variable) or None for variable in optional_variables
	        ]
	        return cls(dataset_name, *optional_values)

	    def __str__(self) -> str:
	        """
	        Return `dataset[subset=...,split=...,revision=...]`.

	        The enrichment fields are not part of the string.
	        """
	        selectors = ",".join(
	            f"{field}={getattr(self, field)}"
	            for field in ("subset", "split", "revision")
	        )
	        return f"{self.dataset}[{selectors}]"


	if __name__ == "__main__":
	    # Load the Hugging Face dataset configured through the environment,
	    # optionally join an enrichment dataset to it column-wise, and serve the
	    # result with Renumics Spotlight.
	    login()

	    hf_settings = HFSettings.from_environ()
	    print(f"Loading Hugging Face dataset {hf_settings}.")
	    ds = datasets.load_dataset(
	        hf_settings.dataset,
	        hf_settings.subset,
	        split=hf_settings.split,
	        revision=hf_settings.revision,
	    )
	    # Validate the type right after loading. Everything below (`len(ds)`,
	    # `ds.split`, `ds.features`) needs a single `datasets.Dataset`; checking
	    # only at the end would let an unrelated error from those calls hide this
	    # more helpful message, and would load the enrichment for nothing.
	    if not isinstance(ds, datasets.Dataset):
	        raise TypeError(
	            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
	            "`datasets.Dataset`. Did you forget to specify subset and/or split "
	            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
	        )

	    if hf_settings.enrichment is not None:
	        # The enrichment is loaded with the same subset and split as the main
	        # dataset, but with its own revision.
	        ds_enrichment = datasets.load_dataset(
	            hf_settings.enrichment,
	            hf_settings.subset,
	            split=hf_settings.split,
	            revision=hf_settings.enrichment_revision,
	        )
	        if len(ds_enrichment) != len(ds):
	            raise RuntimeError(
	                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
	                f"mismatches length of the original dataset ({len(ds)})"
	            )
	        # `axis=1` joins the two datasets column-wise, which is why their
	        # lengths must match.
	        ds = datasets.concatenate_datasets(
	            [ds, ds_enrichment], split=ds.split, axis=1
	        )

	    # Mark every sequence column whose name contains "embedding" as an
	    # embedding column for Spotlight.
	    dtypes = {
	        col: spotlight.dtypes.embedding_dtype
	        for col in ds.column_names
	        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
	    }

	    print(f"Serving Hugging Face dataset {hf_settings}.")

	    # NOTE(review): host/port are hard-coded; presumably 7860 is the port the
	    # hosting environment expects -- confirm before changing.
	    spotlight.show(
	        ds,
	        host="0.0.0.0",
	        port=7860,
	        wait="forever",
	        dtype=dtypes,
	        layout="spotlight-layout.json",
	        analyze=False,
	    )