Asakrg
/

hausa-fine-tune-facebook-mms

Asakrg committed on Feb 5

Commit

3c14fc0

verified ·

1 Parent(s): f86be34

Upload create_vocab.py with huggingface_hub

Files changed (1) hide show

create_vocab.py ADDED Viewed

#!/usr/bin/env python3
"""Build a combined ``vocab.json`` from the per-language files in ``./vocabs``.

Every regular file in ``folder_path`` is read as a token list: the first
whitespace-separated field of each non-blank line is a token. The part of the
file name before the first ``.`` becomes that file's key in the output JSON
object, whose values are the token -> id mappings.
"""
import os
import json

# Directory scanned for vocabulary files (one file per language).
folder_path = "./vocabs"
# Destination of the merged mapping; change this to write somewhere else.
output_path = "vocab.json"
# Language name -> {token: id}; populated by main().
all_dict = {}


def parse_file(filename):
    """Return the token -> id mapping described by *filename*.

    The mapping always starts with the four special tokens ``<pad>`` (0),
    ``<s>`` (1), ``</s>`` (2) and ``<unk>`` (3). Tokens from the file are then
    numbered from 4 upwards in order of first appearance.

    Args:
        filename: Path to a UTF-8 text file. Only the first
            whitespace-separated field of each line is used; blank lines are
            ignored.

    Returns:
        A dict mapping each token string to its integer id.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue
            key = fields[0]
            # Keep the first id a token received: a repeated token (or one that
            # collides with a special token) must neither clobber an existing
            # id nor leave a gap in the id range.
            if key not in dictionary:
                dictionary[key] = len(dictionary)
    return dictionary


def main():
    """Parse every vocabulary file in ``folder_path`` and write ``output_path``."""
    # Sorted so that the result is deterministic even if two files share the
    # same language prefix (the alphabetically last one wins).
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not os.path.isfile(filepath):
            continue
        lang = filename.split(".")[0]
        if not lang:
            # Hidden files such as ".DS_Store" have no language prefix.
            continue
        all_dict[lang] = parse_file(filepath)
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(all_dict, output_file, indent=4, sort_keys=True)


if __name__ == "__main__":
    main()