import glob
import json
import os
import string
import unicodedata
from typing import List, Tuple

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
### Dataset
class NamesDataset(Dataset):
    '''Loads names from different languages.

    Keeps all names in memory and DOES NOT do lazy loading. Expects one
    `<Language>.txt` file per language under `data_dir`, one name per line.
    '''
    def __init__(self, data_dir: str = "data/names", transform=None):
        super().__init__()
        # track object variables
        self.data_dir = data_dir
        self.transform = transform
        # generated variables
        self.names: List[str] = []
        self.labels: List[str] = []
        self.classes_to_idx: dict = {}
        self.idx_to_classes: dict = {}
        # locate all per-language name files (*.txt) and build the label maps
        self.read_data_files()
        self.set_classes()
    def read_data_files(self):
        '''Locates files matching *.txt and reads them; output stored in self.names / self.labels.'''
        files: List[str] = glob.glob(os.path.join(self.data_dir, "*.txt"))
        for file in files:
            # the file name (without extension) is the language label
            language: str = os.path.splitext(os.path.basename(file))[0]
            # read file contents; names contain accented characters, so read as UTF-8
            with open(file, "r", encoding="utf-8") as f:
                names = [line for line in f.read().split("\n") if line]
            # store data
            self.names.extend(names)
            self.labels.extend([language] * len(names))
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        name = self.names[index]
        label = self.labels[index]
        if self.transform:
            name = self.transform(name)
        # one-hot alternative, unused because CrossEntropyLoss expects class indices:
        # label = torch.zeros(len(self.classes_to_idx), dtype=torch.float).scatter_(dim=0, index=torch.tensor([self.classes_to_idx[label]]), value=1)
        label = torch.tensor([self.classes_to_idx[label]])
        # add a leading batch dimension: (1, chars, 52)
        return name.unsqueeze(0), label
    def set_classes(self, cache_location: str = "model/label.json"):
        '''Takes the unique labels and stores the mappings in self.classes_to_idx / self.idx_to_classes.'''
        # sorted() keeps the label -> index mapping stable across runs
        unique_labels = sorted(set(self.labels))
        self.classes_to_idx = {label: i for i, label in enumerate(unique_labels)}
        self.idx_to_classes = {value: key for key, value in self.classes_to_idx.items()}
        # save the mapping to file so it can be reused during inference
        os.makedirs(os.path.dirname(cache_location), exist_ok=True)
        with open(cache_location, "w") as file:
            json.dump(self.idx_to_classes, file, indent=4)
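# A minimal inference-side sketch (an assumption, not part of the training
# pipeline): reload the mapping cached by set_classes above. Note that
# json.dump writes the integer keys of idx_to_classes as strings, so they
# must be converted back to int on load.
def load_idx_to_classes(cache_location: str = "model/label.json") -> dict:
    '''Sketch: reload the index -> language mapping saved by NamesDataset.set_classes.'''
    with open(cache_location, "r") as file:
        cached = json.load(file)
    # JSON object keys are always strings; convert back to int indices
    return {int(index): language for index, language in cached.items()}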
### Transformations
## **Why**: so that they can be applied separately during inference
def _allowed_characters(s: str):
    '''Drops every character outside string.ascii_letters (spaces, apostrophes, etc.).'''
    allowed_characters = string.ascii_letters
    return ''.join(char for char in s if char in allowed_characters)
def _unicode_to_ascii(s: str):
    '''Converts Unicode to ASCII to normalize ACCENTS, e.g. "Ślusàrski" -> "Slusarski".'''
    # code from https://stackoverflow.com/a/518232/2809427
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
def _string_to_Tensor(name: str):
    '''Converts a name to a one-hot tensor of shape (chars, len(string.ascii_letters)).'''
    # scatter_ writes a 1 at each character's column index, one row per character
    indices = torch.tensor([string.ascii_letters.index(char) for char in name], dtype=torch.long).unsqueeze(1)
    return torch.zeros((len(name), len(string.ascii_letters))).scatter_(dim=1, index=indices, value=1)
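# A quick sanity check of the one-hot encoding above (a sketch, not part of
# the original pipeline): string.ascii_letters is "ab...zAB...Z", so for "aB"
# row 0 should be hot at index 0 ('a') and row 1 at index 27 ('B').
def _demo_one_hot():
    t = _string_to_Tensor("aB")
    assert t.shape == (2, 52)
    assert t[0, string.ascii_letters.index('a')] == 1  # column 0
    assert t[1, string.ascii_letters.index('B')] == 1  # column 27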
transform = transforms.Compose([
    _unicode_to_ascii,
    _allowed_characters,
    _string_to_Tensor,
])
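# A minimal sketch of applying the pipeline to a single raw name at inference
# time (an assumption about the serving path, mirroring __getitem__ above):
def _demo_inference_transform():
    x = transform("Ślusàrski").unsqueeze(0)  # "Slusarski": 9 chars after normalization
    assert x.shape == (1, 9, 52)
    return x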
def proxy_collate_batch(batch: List) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    '''Keeps the batch as a list of (name, label) tuples instead of stacking.

    The names have different lengths, so the default collate_fn cannot stack
    them into one tensor without padding; since we do not pad, this proxy just
    filters out names shorter than two characters and returns the jagged list.
    '''
    return [(x, y) for x, y in batch if x.shape[1] > 1]
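# A sketch of consuming the jagged batches above (assumption: the downstream
# model processes one variable-length name at a time, which is why the collate
# function returns a plain list instead of stacked tensors):
def _demo_iterate_batches(loader: DataLoader):
    for batch in loader:
        for x, y in batch:  # x: (1, chars, 52), y: (1,)
            ...  # e.g. logits = model(x); loss = criterion(logits, y)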
if __name__ == "__main__": | |
ds = NamesDataset(transform=transform) | |
train_dataset = DataLoader(ds, batch_size=64, shuffle=True, collate_fn=proxy_collate_batch) | |
batch = next(iter(train_dataset)) | |
print(batch[0][0].shape, batch[0][1].shape) # (1, x, 26), # (1) |