Spaces:
Running
Running
| from datasets import load_dataset | |
| import re | |
def arxiv_remove_version_suffix(arxiv_id):
    """Strip a trailing version marker (e.g. ``v2``) from an arXiv identifier.

    Args:
        arxiv_id: The identifier to clean. Only ``str`` values are
            modified; ``None`` and any other type are passed through.

    Returns:
        The identifier without a trailing ``v<digits>`` suffix when
        *arxiv_id* is a string; otherwise *arxiv_id* unchanged
        (so ``None`` stays ``None``).
    """
    # Only strings can carry a version suffix; None and other unexpected
    # types are returned untouched rather than raising inside the regex.
    if isinstance(arxiv_id, str):
        return re.sub(r'v\d+$', '', arxiv_id)
    return arxiv_id
# Load datasets
def load_and_process(dataset_name):
    """Load the ``train`` split of a dataset as a pandas DataFrame.

    The split is fetched with ``load_dataset`` and converted via
    ``to_pandas()``. If the result has an ``arxiv_id`` column, each value
    is passed through ``arxiv_remove_version_suffix``.

    Args:
        dataset_name: Name passed straight to ``load_dataset``.

    Returns:
        The DataFrame, with ``arxiv_id`` cleaned when that column exists.
    """
    data = load_dataset(dataset_name, split="train").to_pandas()
    # Not every dataset has an arxiv_id column; clean it only when present.
    if 'arxiv_id' in data.columns:
        data['arxiv_id'] = data['arxiv_id'].apply(arxiv_remove_version_suffix)
    return data