crpatel committed
Commit 05d75b4 · 1 Parent(s): 56a0cfd

vocab training corpus sample size increased to 300000

Files changed (2):
  1. app.py     +1 -1
  2. encoder.py +3 -1
app.py CHANGED
@@ -13,7 +13,7 @@ class DecodeRequest(BaseModel):
     tokens: str
 
 # Initialize the tokenizer
-tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=50000)
+tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
 
 app = FastAPI()
 
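For reference, a minimal sketch of how the app.py pieces shown above likely fit together. Only DecodeRequest, the tokenizer constructor, and app = FastAPI() appear in the diff; the /decode route, the comma-separated token format, the import path, and the tokenizer.decode call are assumptions for illustration, not the repo's confirmed API.

from fastapi import FastAPI
from pydantic import BaseModel

from encoder import BPEGujaratiTokenizer  # assumed import path

class DecodeRequest(BaseModel):
    tokens: str  # assumption: comma-separated token ids

# Initialize the tokenizer once at startup, as in the diff above
tokenizer = BPEGujaratiTokenizer(
    corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000
)

app = FastAPI()

@app.post("/decode")  # hypothetical route; not shown in the diff
def decode(req: DecodeRequest):
    ids = [int(t) for t in req.tokens.split(",")]
    return {"text": tokenizer.decode(ids)}  # decode() assumed to exist on the tokenizer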
encoder.py CHANGED
@@ -42,6 +42,7 @@ class BPEGujaratiTokenizer:
 
     def train_bpe(self, corpus, max_vocab_size, sample_size=None):
         self.vocab = {idx: bytes([idx]) for idx in range(256)}
+        print(f"Before training: vocab length: {len(self.vocab)}")
         if sample_size:
             corpus = corpus[:sample_size]
         num_merges = max_vocab_size - len(self.vocab)
@@ -66,6 +67,7 @@ class BPEGujaratiTokenizer:
         print(f"After training: tokens length: {len(tokens)}")
         print("After training: merges length: ", len(self.merges))
         print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+        print(f"After training: vocab length: {len(self.vocab)}")
         return self.vocab, self.merges
 
     def encode(self, text):
@@ -88,7 +90,7 @@ class BPEGujaratiTokenizer:
 import time
 if __name__ == "__main__":
     start_time = time.time()
-    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)
+    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
     end_time = time.time()
     print(f"Time taken to train: {end_time - start_time} seconds")
     print("--------------------------------")