import logging
import os
import sys
from datetime import datetime
from pathlib import Path

import pyarrow as pa
from datasets import Dataset, Features, Value, load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables (expects HF_TOKEN in a local .env file)
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup: mirror everything to stdout and a debug log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")
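# Note: JSONL_SRC is an absolute path specific to the original author's
# machine; point it at your own local copy of the corpus before running.
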
def debug_jsonl_head(jsonl_path, n=5):
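    """Log the first `n` raw lines of the JSONL file for schema inspection."""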
logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:") | |
try: | |
with open(jsonl_path, "r", encoding="utf-8") as f: | |
for i in range(n): | |
line = f.readline() | |
if not line: | |
break | |
logging.info(f"Line {i+1}: {line.strip()}") | |
except Exception as e: | |
logging.error(f"Failed to read JSONL head: {e}") | |
def infer_features_from_sample(jsonl_path, n=100):
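    """Scan the first `n` records and log the Python types seen for each field."""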
    import json
    from collections import defaultdict

    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")

def convert_jsonl_to_arrow(jsonl_path, arrow_path):
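    """Load the JSONL corpus with `datasets` and serialize it to a single Arrow file."""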
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)

        # Try loading a small sample first for debugging.
        # Note: Dataset.from_json() does not accept split slices such as
        # "train[:1000]", so go through load_dataset() instead.
        try:
            sample_dataset = load_dataset("json", data_files=str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # If the schema is known, loading with explicit features can work
            # around type-inference errors. Adjust the fields to match your
            # data, e.g.:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # sample_dataset = load_dataset(
            #     "json", data_files=str(jsonl_path), split="train[:1000]", features=features
            # )
            raise

        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")

        # `Dataset` has no `to_file()` method; write the dataset's underlying
        # pyarrow table with pyarrow's IPC file writer instead.
        with pa.ipc.new_file(str(arrow_path), dataset.data.schema) as writer:
            writer.write_table(dataset.data.table)
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise

def create_readme(dataset):
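    """Write a minimal dataset card (README.md) describing the dataset."""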
content = f"""# Scientific Research Tokenized Dataset | |
- **Examples**: {len(dataset):,} | |
- **Columns**: {dataset.column_names} | |
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | |
## Usage | |
```python | |
from datasets import load_dataset | |
ds = load_dataset("{REPO_ID}") | |
``` | |
""" | |
with open(README_PATH, "w", encoding="utf-8") as f: | |
f.write(content) | |
logging.info("README.md created.") | |
def upload_to_hf():
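    """Upload the Arrow file and README to the Hugging Face Hub dataset repo."""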
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")

def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match the expected import in Main_2.py."""
    return upload_to_hf(*args, **kwargs)
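
# Hypothetical caller-side usage, assuming this module is importable as
# `hf_upload` (the actual module name used by Main_2.py may differ):
#     from hf_upload import upload_to_huggingface
#     upload_to_huggingface()
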
def cleanup():
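    """Remove the locally generated Arrow file and README."""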
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")

def main():
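    """Convert the corpus, build the README, upload both, then clean up."""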
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()

if __name__ == "__main__":
    main()