Safetensors
wav2vec2
mms
Asakrg committed on
Commit
3c14fc0
·
verified ·
1 Parent(s): f86be34

Upload create_vocab.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. create_vocab.py +37 -0
create_vocab.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import json
4
# Directory scanned for per-language vocabulary files.
folder_path = "./vocabs"

# Filled by the directory scan further down: maps a language code (the part
# of the file name before the first dot) to that language's token-to-id dict.
all_dict = {}
7
+
8
def parse_file(filename):
    """Build a token-to-id vocabulary from a whitespace-delimited text file.

    The first whitespace-separated field of every non-blank line is taken as
    a token; any remaining fields on that line are ignored. Tokens are
    numbered in file order starting at 4, right after the four reserved
    special tokens.

    Args:
        filename: Path of the vocabulary file to read.

    Returns:
        A dict mapping each token to its integer id. It always contains the
        special tokens ``<pad>`` (0), ``<s>`` (1), ``</s>`` (2) and
        ``<unk>`` (3).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    value = 4

    # Decode explicitly as UTF-8: the platform default encoding differs
    # between systems and would mis-read (or fail on) non-ASCII tokens.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue  # Blank or whitespace-only line: no token here.
            # A token that appears more than once keeps the id of its last
            # occurrence; the counter still advances for every non-blank line.
            dictionary[fields[0]] = value
            value += 1

    return dictionary
26
+
27
# Build one vocabulary per regular file found directly inside folder_path.
for filename in os.listdir(folder_path):
    filepath = os.path.join(folder_path, filename)
    if not os.path.isfile(filepath):
        continue  # Sub-directories are not vocabulary files.

    # The language code is everything before the first dot of the file name.
    lang = filename.split(".")[0]
    if not lang:
        # Dotfiles such as ".DS_Store" give an empty code; they are not
        # vocabularies and would otherwise end up under the "" key.
        continue
    all_dict[lang] = parse_file(filepath)


# Destination of the combined per-language vocabulary; change as needed.
output_path = "vocab.json"

# sort_keys makes the output independent of the arbitrary os.listdir order.
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(all_dict, output_file, indent=4, sort_keys=True)