Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import os | |
from huggingface_hub import HfApi, list_datasets | |
from datasets import load_dataset | |
def search_huggingface_datasets(query, limit=20): | |
""" | |
Search for datasets on Hugging Face Hub. | |
Args: | |
query: Search query string | |
limit: Maximum number of results to return | |
Returns: | |
List of dataset metadata | |
""" | |
try: | |
api = HfApi() | |
datasets = list_datasets( | |
filter=query, | |
limit=limit | |
) | |
# Convert to list of dicts with relevant info | |
results = [] | |
for dataset in datasets: | |
results.append({ | |
'id': dataset.id, | |
'name': dataset.id.split('/')[-1], | |
'description': dataset.description or "No description available", | |
'author': dataset.author or "Unknown", | |
'tags': dataset.tags, | |
'downloads': dataset.downloads | |
}) | |
return results | |
except Exception as e: | |
st.error(f"Error searching Hugging Face Hub: {str(e)}") | |
return [] | |
def load_huggingface_dataset(dataset_id, split='train'): | |
""" | |
Load a dataset from Hugging Face Hub. | |
Args: | |
dataset_id: ID of the dataset on HF Hub (e.g., 'mnist', 'glue', etc.) | |
split: Dataset split to load (e.g., 'train', 'test', 'validation') | |
Returns: | |
Pandas DataFrame containing the dataset | |
""" | |
try: | |
# Load the dataset | |
dataset = load_dataset(dataset_id, split=split) | |
# Convert to pandas DataFrame | |
df = dataset.to_pandas() | |
return df | |
except Exception as e: | |
st.error(f"Error loading dataset '{dataset_id}': {str(e)}") | |
raise | |
def upload_to_huggingface(dataset, dataset_name, token=None): | |
""" | |
Upload a dataset to Hugging Face Hub. | |
Args: | |
dataset: Pandas DataFrame to upload | |
dataset_name: Name for the dataset | |
token: Hugging Face API token (optional, will use environment variable if not provided) | |
Returns: | |
URL to the uploaded dataset | |
""" | |
# Get token from environment if not provided | |
if token is None: | |
token = os.getenv("HF_TOKEN") | |
if not token: | |
raise ValueError("No Hugging Face token provided. Set the HF_TOKEN environment variable or pass a token.") | |
try: | |
# Convert to HF dataset | |
from datasets import Dataset | |
hf_dataset = Dataset.from_pandas(dataset) | |
# Upload to HF Hub | |
push_result = hf_dataset.push_to_hub( | |
dataset_name, | |
token=token | |
) | |
return f"https://huggingface.co/datasets/{push_result.repo_id}" | |
except Exception as e: | |
st.error(f"Error uploading to Hugging Face Hub: {str(e)}") | |
raise | |