Deepak Sahu committed
Commit 2c1ff7f · 1 Parent(s): ac2255b

training; app
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,16 @@
+{
+    "name": "NLP Pytorch",
+
+    "image": "76e5e98ec29501e94739cafb6daa580774619fa92b6c4d71efade219a23b4b22"
+    ,
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-toolsai.jupyter",
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                "ms-python.debugpy"
+            ]
+        }
+    }
+}
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/ filter=lfs diff=lfs merge=lfs -text
 *.txt filter=lfs diff=lfs merge=lfs -text
+.pth filter=lfs diff=lfs merge=lfs -text
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
README.md CHANGED
@@ -10,7 +10,7 @@ pinned: false
 short_description: I guess you might speak <Language>
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Language Guesser based on Name
 
 
 ## Data Source
@@ -23,4 +23,19 @@ Last Accessed: 30th Dec 2024
 The code is partially inspired by https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
 
 **Changes I Introduced**
-- DataLoader is added
+- NamesDataset is separated from the transformations, which makes it possible to apply the same transformations during inference
+- The target is an integer class index instead of a one-hot encoding
+- Changed the loss from the combination of LogSoftmax + NLLLoss to CrossEntropyLoss (EXACTLY THE SAME STUFF), which in turn required removing the softmax layer from the architecture; see the sketch after this diff
+- DataLoader is added
+- Input is made batch-first, so the corresponding RNN is also made batch-first
+
+## Evaluation
+Although the code is mostly replicated, I changed the transform pipeline to lowercase the input data, and that confused the model.
+
+- Confusion matrix **with lowercase** transformation
+![Click here to view the image](model/lowercase_evaluate.png)
+![Click here to view the image](model/lowercase_loss.png)
+
+- Confusion matrix **without lowercase** transformation
+![Click here to view the image](model/evaluate.png)
+![Click here to view the image](model/loss.png)
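
As flagged in the change list above, here is a minimal sketch (not part of the commit) checking that CrossEntropyLoss on raw logits computes the same thing as LogSoftmax followed by NLLLoss, with integer class targets rather than one-hot vectors:

```python
import torch
from torch import nn

logits = torch.randn(4, 18)            # (batch, n_classes); 18 languages here
targets = torch.tensor([0, 5, 17, 3])  # integer class indices, not one-hot

ce = nn.CrossEntropyLoss()(logits, targets)
nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), targets)

assert torch.allclose(ce, nll)  # identical up to floating-point precision
```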
app.py ADDED
@@ -0,0 +1,53 @@
+GRADIO_TITLE = "Language Guesser based on Name"
+GRADIO_DESCRIPTION = '''
+This is a self-learning project which replicates the [pytorch tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html) with modifications.
+Kindly see [my GitHub: you may speak](https://github.com/LunaticMaestro/-NLP-_you_may_speak) readme to check out the modifications.
+
+Model trained on names from the following languages: ['Korean 🇰🇷', 'Portuguese 🇵🇹', 'Dutch 🇳🇱', 'Italian 🇮🇹', 'German 🇩🇪', 'Scottish 🏴\U000e0067\U000e0062\U000e0073\U000e0063\U000e0074\U000e007f', 'Vietnamese 🇻🇳', 'French 🇫🇷', 'English 🇬🇧', 'Arabic 🇲🇦', 'Irish 🇮🇪', 'Chinese 🇨🇳', 'Japanese 🇯🇵', 'Russian 🇷🇺', 'Polish 🇵🇱', 'Czech 🇨🇿', 'Spanish 🇪🇸', 'Greek 🇬🇷']
+
+'''
+
+import gradio as gr
+from z_modelops import NameToLanguages
+from z_inference import setup_inference, infer_lang
+
+model, labels = setup_inference()
+
+def get_language(name):
+    '''Returns the top languages for a name, one per line, each with its flag emoji.'''
+    languages = infer_lang(name, model, labels)
+
+    language_flags = {
+        "Korean": "\U0001F1F0\U0001F1F7",      # South Korea
+        "Portuguese": "\U0001F1F5\U0001F1F9",  # Portugal
+        "Dutch": "\U0001F1F3\U0001F1F1",       # Netherlands
+        "Italian": "\U0001F1EE\U0001F1F9",     # Italy
+        "German": "\U0001F1E9\U0001F1EA",      # Germany
+        "Scottish": "\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F",  # Scotland (flag tag sequence)
+        "Vietnamese": "\U0001F1FB\U0001F1F3",  # Vietnam
+        "French": "\U0001F1EB\U0001F1F7",      # France
+        "English": "\U0001F1EC\U0001F1E7",     # United Kingdom
+        "Arabic": "\U0001F1F2\U0001F1E6",      # Morocco (commonly associated with Arabic)
+        "Irish": "\U0001F1EE\U0001F1EA",       # Ireland
+        "Chinese": "\U0001F1E8\U0001F1F3",     # China
+        "Japanese": "\U0001F1EF\U0001F1F5",    # Japan
+        "Russian": "\U0001F1F7\U0001F1FA",     # Russia
+        "Polish": "\U0001F1F5\U0001F1F1",      # Poland
+        "Czech": "\U0001F1E8\U0001F1FF",       # Czech Republic
+        "Spanish": "\U0001F1EA\U0001F1F8",     # Spain
+        "Greek": "\U0001F1EC\U0001F1F7"        # Greece
+    }
+
+    return '\n'.join([lang + " " + language_flags[lang] for lang in languages])
+
+
+input_textbox = gr.Textbox(label="Your Name", placeholder="Naifeh", max_lines=1)
+
+
+demo = gr.Interface(
+    fn=get_language,
+    inputs=input_textbox,
+    outputs=gr.Label(label="You may speak"),
+    title=GRADIO_TITLE,
+    description=GRADIO_DESCRIPTION
+)
+demo.launch(debug=True)
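
get_language returns a newline-joined string, which gr.Label displays as a single value. gr.Label can also accept a {class_name: confidence} dict and render it as a ranked confidence list; a hypothetical variant is sketched below. It assumes infer_lang were extended to expose probabilities, which it currently does not, so uniform placeholder scores are used:

```python
# Hypothetical variant (not in the commit): gr.Label renders a
# {label: confidence} dict as a ranked confidence list.
def get_language_with_scores(name: str) -> dict:
    languages = infer_lang(name, model, labels)
    # placeholder uniform scores; infer_lang returns only the top-k names
    return {lang: 1.0 / len(languages) for lang in languages}
```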
model/evaluate.png ADDED
model/label.json ADDED
@@ -0,0 +1,20 @@
+{
+    "0": "Korean",
+    "1": "Portuguese",
+    "2": "Dutch",
+    "3": "Italian",
+    "4": "German",
+    "5": "Scottish",
+    "6": "Vietnamese",
+    "7": "French",
+    "8": "English",
+    "9": "Arabic",
+    "10": "Irish",
+    "11": "Chinese",
+    "12": "Japanese",
+    "13": "Russian",
+    "14": "Polish",
+    "15": "Czech",
+    "16": "Spanish",
+    "17": "Greek"
+}
model/loss.png ADDED
model/lowercase_evaluate.png ADDED
model/lowercase_loss.png ADDED
model/rnn.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f401906eced2fa8d49e39126687542cb871b1b7112d34123f993139074e68b9f
+size 106016
z_dataops.py ADDED
@@ -0,0 +1,103 @@
+import json
+from typing import List
+import torch
+import unicodedata
+from torch.utils.data import DataLoader, Dataset
+from torchvision.transforms import transforms, Lambda
+import glob
+import os
+import string
+
+### Dataset
+class NamesDataset(Dataset):
+    '''Loads names from different languages. Stores the names in memory and DOES NOT do lazy loading.
+    '''
+    def __init__(self, data_dir: str="data/names", transform=None):
+        super().__init__()
+        # track object variables
+        self.data_dir = data_dir
+        self.transform = transform
+        # generated variables
+        self.names = []
+        self.labels = []
+        self.classes_to_idx: dict = {}
+        self.idx_to_classes: dict = {}
+
+        # locate all language name files (.txt)
+        self.read_data_files()
+        self.set_classes()
+
+    def read_data_files(self):
+        '''Locates files matching the .txt pattern and reads them; output is stored in self.names and self.labels.'''
+        files: List[str] = glob.glob(os.path.join(self.data_dir, "*.txt"))
+        for file in files:
+            language: str = os.path.splitext(os.path.basename(file))[0]
+            # Read file contents
+            with open(file, "r") as f:
+                contents = f.read()
+                names = contents.split("\n")  # may yield empty trailing entries; filtered later in the collate fn
+            # Store data
+            self.names.extend(names)
+            self.labels.extend([language for _ in range(len(names))])
+        return None
+
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, index):
+        name = self.names[index]
+        label = self.labels[index]
+
+        if self.transform:
+            name = self.transform(name)
+
+        # previous one-hot target, replaced by an integer class index (see README):
+        # label: torch.Tensor = torch.zeros((len(self.classes_to_idx)), dtype=torch.float).scatter_(dim=0, index=torch.tensor(self.classes_to_idx.get(label)), value=1)
+        label = torch.tensor([self.classes_to_idx.get(label)])
+
+        return name.unsqueeze(0), label
+
+    def set_classes(self, cache_location: str = "model/label.json"):
+        '''Takes the unique labels and stores them in self.classes_to_idx / self.idx_to_classes.'''
+        # first save the labels to file so they can be used during inference
+        unique_labels = list(set(self.labels))
+
+        self.classes_to_idx = dict([(label, i) for i, label in enumerate(unique_labels)])
+        self.idx_to_classes = {value: key for key, value in self.classes_to_idx.items()}
+
+        with open(cache_location, "w") as file:
+            json.dump(self.idx_to_classes, file, indent=4)
+
+### Transformations
+## **Why**: so that they can be applied separately during inference
+
+def _allowed_characters(s: str):
+    '''Drops every character outside a-zA-Z.'''
+    allowed_characters = string.ascii_letters
+    return ''.join([char if allowed_characters.find(char) >= 0 else '' for char in s])
+
+def _unicode_to_ascii(s: str):
+    '''Converts Unicode to ASCII to normalize ACCENTS'''
+    # CODE from https://stackoverflow.com/a/518232/2809427
+    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
+
+def _string_to_Tensor(name: str):
+    '''Converts to a one-hot tensor of shape (chars, len(string.ascii_letters)).'''
+    name_lower = name  # lowercasing intentionally removed; it hurt the model (see README Evaluation)
+    name_tensor = torch.zeros((len(name_lower), len(string.ascii_letters))).scatter_(dim=1, index=torch.tensor(list(map(string.ascii_letters.index, name_lower))).unsqueeze(1), value=1)
+    return name_tensor
+
+transform = transforms.Compose([
+    _unicode_to_ascii,
+    _allowed_characters,
+    _string_to_Tensor,
+])
+
+def proxy_collate_batch(batch: List) -> List[torch.Tensor]:
+    '''We are not padding the sequences, so this proxy keeps the batch as a list instead of stacking a jagged array; it also drops names shorter than 2 characters (e.g. empty lines from the file reads).'''
+    batch = [(x, y) for x, y in batch if x.shape[1] > 1]
+    return batch
+
+if __name__ == "__main__":
+    ds = NamesDataset(transform=transform)
+    train_dataset = DataLoader(ds, batch_size=64, shuffle=True, collate_fn=proxy_collate_batch)
+    batch = next(iter(train_dataset))
+    print(batch[0][0].shape, batch[0][1].shape)  # (1, chars, 52), (1,)
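
proxy_collate_batch deliberately returns a plain Python list of (x, y) pairs because names have different lengths. An alternative, sketched below and not part of the commit, would pad each batch to its longest name with torch.nn.utils.rnn.pad_sequence so the batch becomes one tensor:

```python
# Sketch of a hypothetical padding collate, as an alternative to proxy_collate_batch.
import torch
from torch.nn.utils.rnn import pad_sequence

def padded_collate_batch(batch):
    # each x has shape (1, chars, 52); drop the leading dim before padding
    xs = [x.squeeze(0) for x, y in batch if x.shape[1] > 1]
    ys = torch.cat([y for x, y in batch if x.shape[1] > 1])
    # (batch, max_chars, 52) and (batch,)
    return pad_sequence(xs, batch_first=True), ys
```

The trade-off: padding lets the model process a whole batch in one forward pass, but the RNN then also consumes the trailing zero timesteps, which changes the final hidden state unless the sequences are additionally packed.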
z_inference.py ADDED
@@ -0,0 +1,30 @@
+import torch
+from z_modelops import NameToLanguages, load_labels
+from z_dataops import transform
+import json
+from torch import nn
+
+def load_model(location="model/rnn.pth"):
+    '''Loads the model, together with its architecture.'''
+    model = torch.load(location, weights_only=False)
+    return model
+
+def infer_lang(name: str, model, label: dict, k=3) -> list:
+    '''Returns the top-k language names for a given name, best guess first.'''
+    name_tensor = transform(name)
+    with torch.no_grad():
+        logits = model(name_tensor.unsqueeze(0))
+        y_pred = nn.Softmax(dim=1)(logits)
+        top_k_idx = y_pred.sort(descending=True, dim=1).indices.numpy()[0][:k]
+    return [label[str(idx)] for idx in top_k_idx]
+
+def setup_inference():
+    # load model
+    model = load_model()
+    # load the index-to-language mapping
+    labels = load_labels()
+    return model, labels
+
+
+if __name__ == "__main__":
+    model, labels = setup_inference()
+
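
A usage sketch for this module (it assumes the model/rnn.pth and model/label.json files added in this commit exist; "Naifeh" is the placeholder name from app.py):

```python
from z_inference import setup_inference, infer_lang

model, labels = setup_inference()
print(infer_lang("Naifeh", model, labels))  # top-3 language names, best guess first
```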
z_modelops.py ADDED
@@ -0,0 +1,155 @@
+import json
+import numpy as np
+from torch import nn
+import torch
+from torch.utils.data import random_split, DataLoader
+from z_dataops import NamesDataset, transform, proxy_collate_batch
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import string
+
+class NameToLanguages(nn.Module):
+    def __init__(self, feature_size=26, n_classes=18):
+        super().__init__()
+
+        # create simple architecture
+        self.net_rnn = nn.RNN(input_size=feature_size, hidden_size=128, batch_first=True)
+        self.net_linear = nn.Linear(in_features=128, out_features=n_classes)
+
+    def forward(self, x):
+        # last_ts is the final hidden state h_n of shape (num_layers=1, batch, 128)
+        rnn_out, last_ts = self.net_rnn(x)
+        # raw logits; no softmax layer, since CrossEntropyLoss applies it
+        output = self.net_linear(last_ts[0])
+        return output
+
+def training(model: nn.Module, train_batch: list, optimizer, loss_fn):
+    '''One optimizer step over a batch; train_batch is a list of (x, y) pairs from proxy_collate_batch.'''
+    model.train()
+    batch_loss = 0
+
+    for x, y in train_batch:
+        # predict
+        y_pred = model(x)
+        # compute loss
+        curr_loss = loss_fn(y_pred, y)
+        batch_loss += curr_loss
+
+    # reset grad
+    optimizer.zero_grad()
+    # calculate grad
+    batch_loss.backward()
+    # nn.utils.clip_grad_norm_(model.parameters(), 3)
+    # step
+    optimizer.step()
+
+    return batch_loss.item() / len(train_batch)
+
+def validation(model, dl: DataLoader, loss_fn):
+    model.eval()
+    batch_loss = 0
+    with torch.no_grad():
+        for item in dl:
+            for x, y in item:
+                # predict
+                y_pred = model(x)
+                # loss
+                curr_loss = loss_fn(y_pred, y)
+                batch_loss += curr_loss
+    # val_dl uses the default batch_size=1, so len(dl) is (close to) the sample count
+    return batch_loss.item() / len(dl)
+
+def plot_losses(loss_label, title, save_location="model/loss.png"):
+    for k, v in loss_label.items():
+        plt.plot(v, label=k)
+    plt.legend()
+    plt.title(title)
+    plt.savefig(save_location)
+
+def load_labels(input_file="model/label.json"):
+    # Read the index-to-language dictionary from the file
+    with open(input_file, 'r') as file:
+        dictionary = json.load(file)
+    return dictionary
+
+def evaluate(rnn, validation_dl, classes):
+    # CODE AS IS FROM: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#evaluating-the-results
+    confusion = torch.zeros(len(classes), len(classes))
+
+    rnn.eval()  # set to eval mode
+    with torch.no_grad():  # do not record the gradients during eval phase
+        for item in validation_dl:
+            for text_tensor, label in item:
+                output = rnn(text_tensor)
+
+                _, idx = output.topk(1)
+                guess, guess_i = classes[str(idx.item())], idx.item()
+                label_i = label.item()
+                confusion[label_i][guess_i] += 1
+
+    # Normalize by dividing every row by its sum
+    for i in range(len(classes)):
+        denom = confusion[i].sum()
+        if denom > 0:
+            confusion[i] = confusion[i] / denom
+
+    # Set up plot
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    cax = ax.matshow(confusion.cpu().numpy())  # numpy uses cpu here so we need a cpu tensor
+    fig.colorbar(cax)
+
+    tag = [classes[str(i)] for i in range(len(classes))]
+    # Set up axes
+    ax.set_xticks(np.arange(len(classes)), labels=tag, rotation=90)
+    ax.set_yticks(np.arange(len(classes)), labels=tag)
+
+    # Force label at every tick
+    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
+    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
+
+    # sphinx_gallery_thumbnail_number = 2
+    plt.savefig("model/evaluate.png")
+
+if __name__ == "__main__":
+    model = NameToLanguages(feature_size=len(string.ascii_letters))
+
+    # #Sanity Check Model
+    # x = torch.randn((1, 7, len(string.ascii_letters)))  # (batch, word_length, one-hot ascii char)
+    # model.eval()
+    # with torch.no_grad():
+    #     out = model(x)
+    # print(out.shape)
+
+    # #Optimizers, Loss
+    optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
+    loss_fn = nn.CrossEntropyLoss()
+    n_epoch = 27
+
+    # #Training Loop
+    ds = NamesDataset(transform=transform)
+    train_ds, val_ds = random_split(ds, [0.7, 0.3], generator=torch.Generator().manual_seed(31))
+    train_dl = DataLoader(dataset=train_ds, batch_size=64, collate_fn=proxy_collate_batch)
+    val_dl = DataLoader(dataset=val_ds, collate_fn=proxy_collate_batch)
+    # #Trackers
+    train_losses, val_losses = [], []
+
+    for epoch in range(n_epoch):
+        for batch in train_dl:
+            train_loss = training(model, batch, optimizer, loss_fn)
+
+        # track the last batch's train loss for this epoch, then report val loss
+        train_losses.append(train_loss)
+        val_loss = validation(model, val_dl, loss_fn)
+        val_losses.append(val_loss)
+
+        print(f"Epoch {epoch}: Train_loss: {train_losses[-1]}, Val_loss: {val_loss}")
+    plot_losses({"train": train_losses, "val": val_losses}, "Training Loss")
+    torch.save(model, "model/rnn.pth")
+
+    classes = load_labels()
+    evaluate(model, val_dl, classes)
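
A quick shape check of the forward pass (a sketch mirroring the commented-out sanity block above, using the 52 one-hot features that the transform in z_dataops actually produces):

```python
import string
import torch
from z_modelops import NameToLanguages

model = NameToLanguages(feature_size=len(string.ascii_letters))  # 52 features: a-zA-Z
x = torch.randn(1, 7, len(string.ascii_letters))  # (batch, name_length, one-hot dim)
model.eval()
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # torch.Size([1, 18]); raw logits for the 18 languages
```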