vocab corpus increased - 300000
Changed files:
- app.py +1 -1
- encoder.py +3 -1
app.py
CHANGED
@@ -13,7 +13,7 @@ class DecodeRequest(BaseModel):
     tokens: str
 
 # Initialize the tokenizer
-tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=
+tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
 
 app = FastAPI()
 
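For context, a minimal sketch of how app.py plausibly wires this tokenizer into the FastAPI app. Only DecodeRequest, the tokenizer construction, and app = FastAPI() appear in the diff; EncodeRequest, the /encode and /decode routes, the comma-separated token wire format, and the decode method are assumptions for illustration:

# Hypothetical sketch; only the marked lines are taken from the diff above.
from fastapi import FastAPI
from pydantic import BaseModel

from encoder import BPEGujaratiTokenizer  # import path assumed from encoder.py

class EncodeRequest(BaseModel):  # assumed counterpart to DecodeRequest
    text: str

class DecodeRequest(BaseModel):  # from the diff
    tokens: str

# Initialize the tokenizer (the line this commit changes, from the diff)
tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt",
                                 max_vocab_size=5000, sample_size=300000)

app = FastAPI()  # from the diff

@app.post("/encode")  # route name assumed
def encode(req: EncodeRequest):
    return {"tokens": tokenizer.encode(req.text)}

@app.post("/decode")  # route and wire format assumed
def decode(req: DecodeRequest):
    ids = [int(t) for t in req.tokens.split(",")]
    return {"text": tokenizer.decode(ids)}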
encoder.py
CHANGED
@@ -42,6 +42,7 @@ class BPEGujaratiTokenizer:
 
     def train_bpe(self, corpus, max_vocab_size, sample_size=None):
         self.vocab = {idx: bytes([idx]) for idx in range(256)}
+        print(f"Before training: vocab length: {len(self.vocab)}")
         if sample_size :
             corpus = corpus[:sample_size]
         num_merges = max_vocab_size - len(self.vocab)
@@ -66,6 +67,7 @@ class BPEGujaratiTokenizer:
         print(f"After training: tokens length: {len(tokens)}")
         print("After training: merges length: ", len(self.merges))
         print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+        print(f"After training: vocab length: {len(self.vocab)}")
         return self.vocab, self.merges
 
     def encode(self, text):
@@ -88,7 +90,7 @@ class BPEGujaratiTokenizer:
 import time
 if __name__ == "__main__":
     start_time = time.time()
-    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=
+    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
     end_time = time.time()
     print(f"Time taken to train: {end_time - start_time} seconds")
     print("--------------------------------")
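For reference, a minimal self-contained sketch of the byte-level BPE training loop that the train_bpe hunks above imply: a 256-entry byte vocab, optional truncation of the corpus to sample_size characters, and max_vocab_size - 256 greedy merges. The helpers get_stats and merge are assumptions, not necessarily how encoder.py implements them:

from collections import Counter

def get_stats(ids):
    # Count occurrences of each adjacent token pair.
    return Counter(zip(ids, ids[1:]))

def merge(ids, pair, idx):
    # Replace every occurrence of `pair` in `ids` with the new token `idx`.
    out, i = [], 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            out.append(idx)
            i += 2
        else:
            out.append(ids[i])
            i += 1
    return out

def train_bpe(corpus, max_vocab_size, sample_size=None):
    vocab = {idx: bytes([idx]) for idx in range(256)}  # base vocab: raw bytes
    if sample_size:
        corpus = corpus[:sample_size]  # train on a prefix of the corpus
    ids = list(corpus.encode("utf-8"))
    merges = {}
    num_merges = max_vocab_size - len(vocab)  # here: 5000 - 256 = 4744 merges
    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:
            break
        pair = max(stats, key=stats.get)  # most frequent adjacent pair
        idx = 256 + i                     # next free token id
        ids = merge(ids, pair, idx)
        merges[pair] = idx
        vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
    return vocab, merges

Under this reading, raising sample_size to 300000 simply gives the merge loop a longer prefix of gu_corpus.txt to count pairs over, which is why the training time measured in the __main__ block grows with it.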