import logging
import os
import sys
from datetime import datetime
from pathlib import Path

import pyarrow as pa
from datasets import Dataset, Features, Value, load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables (expects HF_TOKEN in a local .env file)
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup: mirror everything to stdout and a debug log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")
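# Note: JSONL_SRC is an absolute path specific to the original author's
# machine; point it at your own local copy of the corpus before running.
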
def debug_jsonl_head(jsonl_path, n=5):
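    """Log the first `n` raw lines of the JSONL file for schema inspection."""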
logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:") | |
try: | |
with open(jsonl_path, "r", encoding="utf-8") as f: | |
for i in range(n): | |
line = f.readline() | |
if not line: | |
break | |
logging.info(f"Line {i+1}: {line.strip()}") | |
except Exception as e: | |
logging.error(f"Failed to read JSONL head: {e}") | |
def infer_features_from_sample(jsonl_path, n=100):
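    """Scan the first `n` records and log the Python types seen for each field."""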
    import json
    from collections import defaultdict

    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")

def convert_jsonl_to_arrow(jsonl_path, arrow_path):
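    """Load the JSONL corpus with `datasets` and serialize it to a single Arrow file."""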
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)

        # Try loading a small sample first for debugging.
        # Note: Dataset.from_json() does not accept split slices such as
        # "train[:1000]", so go through load_dataset() instead.
        try:
            sample_dataset = load_dataset("json", data_files=str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # If the schema is known, loading with explicit features can work
            # around type-inference errors. Adjust the fields to match your
            # data, e.g.:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # sample_dataset = load_dataset(
            #     "json", data_files=str(jsonl_path), split="train[:1000]", features=features
            # )
            raise

        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")

        # `Dataset` has no `to_file()` method; write the dataset's underlying
        # pyarrow table with pyarrow's IPC file writer instead.
        with pa.ipc.new_file(str(arrow_path), dataset.data.schema) as writer:
            writer.write_table(dataset.data.table)
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise

def create_readme(dataset):
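    """Write a minimal dataset card (README.md) describing the dataset."""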
content = f"""# Scientific Research Tokenized Dataset | |
- **Examples**: {len(dataset):,} | |
- **Columns**: {dataset.column_names} | |
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | |
## Usage | |
```python | |
from datasets import load_dataset | |
ds = load_dataset("{REPO_ID}") | |
``` | |
""" | |
with open(README_PATH, "w", encoding="utf-8") as f: | |
f.write(content) | |
logging.info("README.md created.") | |
def upload_to_hf():
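    """Upload the Arrow file and README to the Hugging Face Hub dataset repo."""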
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")

def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match the expected import in Main_2.py."""
    return upload_to_hf(*args, **kwargs)
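
# Hypothetical caller-side usage, assuming this module is importable as
# `hf_upload` (the actual module name used by Main_2.py may differ):
#     from hf_upload import upload_to_huggingface
#     upload_to_huggingface()
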
def cleanup():
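    """Remove the locally generated Arrow file and README."""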
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")

def main():
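    """Convert the corpus, build the README, upload both, then clean up."""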
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()

if __name__ == "__main__":
    main()