Spaces:
Sleeping
Sleeping
| from torch.utils.data import Dataset | |
class TextDataset(Dataset):
    """Map-style dataset of ``(text, label)`` pairs held in plain Python lists.

    Both columns are copied out of the input table with ``tolist()`` when the
    dataset is built; only the two resulting lists are stored on the instance.
    """

    def __init__(self, df, *, text_column: str = 'text', label_column: str = 'label'):
        """Pull the text and label columns out of ``df``.

        Args:
            df: Table-like object that is indexable by column name and whose
                columns expose ``tolist()`` (e.g. a pandas DataFrame).
            text_column: Name of the column holding the texts.
            label_column: Name of the column holding the labels.

        Raises:
            KeyError: Propagated from ``df`` when a requested column is missing
                (this is what a pandas DataFrame raises).
        """
        self.texts = df[text_column].tolist()
        self.labels = df[label_column].tolist()

    def __len__(self) -> int:
        """Return the number of stored samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` tuple stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]
if __name__ == "__main__":
    # Script entry point: download both splits of the dataset, wrap each one in
    # a TextDataset and serialize the two objects under ./data.
    import os

    import pandas as pd
    import torch

    dataset_url = "hf://datasets/AlexSham/Toxic_Russian_Comments/"
    splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}

    # The split files are JSON Lines, hence lines=True.
    df_train = pd.read_json(dataset_url + splits["train"], lines=True)
    df_test = pd.read_json(dataset_url + splits["test"], lines=True)

    dataset_train = TextDataset(df_train)
    dataset_test = TextDataset(df_test)

    # torch.save does not create missing parent directories, so make sure the
    # output folder exists before writing into it.
    os.makedirs('data', exist_ok=True)

    # NOTE(review): torch.save pickles the whole TextDataset object. Because
    # this file runs as a script, the class is recorded under `__main__`, so
    # whatever loads these files must be able to resolve that name, and recent
    # PyTorch releases need `weights_only=False` to unpickle arbitrary objects.
    torch.save(dataset_train, 'data/dataset_train.pt')
    torch.save(dataset_test, 'data/dataset_test.pt')