In [2]:

def read_corpus(corpus_path: str) -> str:
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        corpus_path: Path of the text file to read.

    Returns:
        The full decoded contents of the file.
    """
    with open(corpus_path, mode="r", encoding="utf-8") as corpus_file:
        return corpus_file.read()



In [3]:


class BPEGujaratiTokenizer:
    """Byte-level byte-pair-encoding (BPE) tokenizer trained on a text corpus.

    The corpus is read from disk, its UTF-8 bytes form the initial 256-entry
    vocabulary, and the most frequent adjacent token pair is merged repeatedly
    into a new token id until the requested vocabulary size is reached (or no
    pairs are left to merge).

    Attributes set by ``__init__``:
        corpus: Full text returned by ``read_corpus``.
        max_vocab_size: Target vocabulary size (256 byte tokens + merges).
        corpus_vocab: Sorted list of the distinct characters in the corpus.
        corpus_vocab_size: Number of distinct characters in the corpus.
        stoi / itos: Character <-> index lookup tables over ``corpus_vocab``.
        sample_size: Number of leading corpus characters used for training.
        vocab: Mapping of token id -> byte string it expands to.
        merges: Mapping of ``(id, id)`` pair -> merged token id, in the order
            the merges were learned.
    """

    def __init__(self, corpus_path: str, max_vocab_size: int = 5001, sample_size: int = 50000):
        """Load the corpus at ``corpus_path`` and train the BPE merges on it.

        Args:
            corpus_path: Path of the UTF-8 text corpus.
            max_vocab_size: Target vocabulary size, including the 256 byte tokens.
            sample_size: Number of leading characters of the corpus to train on;
                a falsy value trains on the whole corpus.
        """
        self.corpus = read_corpus(corpus_path)
        self.max_vocab_size = max_vocab_size
        # Character-level tables over the raw corpus; BPE training below works
        # on bytes and does not read these.
        self.corpus_vocab = sorted(set(self.corpus))
        self.corpus_vocab_size = len(self.corpus_vocab)
        self.stoi = {ch: i for i, ch in enumerate(self.corpus_vocab)}
        self.itos = {i: ch for i, ch in enumerate(self.corpus_vocab)}
        self.sample_size = sample_size

        self.vocab, self.merges = self.train_bpe(self.corpus, self.max_vocab_size, self.sample_size)

    def get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``.

        Args:
            ids: Sequence of token ids.

        Returns:
            Dict mapping ``(left, right)`` -> number of occurrences, keyed in
            order of first appearance.
        """
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Replace every non-overlapping occurrence of ``pair`` in ``ids`` with ``idx``.

        The scan is left to right, so in ``[a, a, a]`` the pair ``(a, a)`` is
        replaced once, giving ``[idx, a]``.

        Args:
            ids: Sequence of token ids.
            pair: The ``(left, right)`` pair to replace.
            idx: Token id that replaces the pair.

        Returns:
            A new list of token ids; ``ids`` itself is not modified.
        """
        newids = []
        i = 0
        last = len(ids) - 1
        while i < len(ids):
            if i < last and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train_bpe(self, corpus, max_vocab_size, sample_size=None):
        """Learn BPE merges from ``corpus`` and store them on the instance.

        Resets ``self.vocab`` to the 256 single-byte tokens and ``self.merges``
        to empty, then performs up to ``max_vocab_size - 256`` merges. Training
        stops early when the token sequence has no adjacent pairs left, so a
        short or empty corpus yields fewer merges instead of raising.

        Args:
            corpus: Training text.
            max_vocab_size: Target vocabulary size, including the 256 byte tokens.
            sample_size: If truthy, only the first ``sample_size`` characters
                of ``corpus`` are used.

        Returns:
            Tuple ``(vocab, merges)`` - the same objects stored on ``self``.
        """
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        print(f"Before Training Vocab length {len(self.vocab)}")
        if sample_size:
            corpus = corpus[:sample_size]
        num_merges = max_vocab_size - len(self.vocab)
        print(f"num_merges required {num_merges}")
        tokens = list(corpus.encode("utf-8"))
        ids = list(tokens)  # working copy; `tokens` keeps the original length for reporting
        self.merges = {}  # (int, int) -> int
        print(f"Before training: ids length: {len(ids)}")
        print(f"Before training: tokens length: {len(tokens)}")
        print("Before training: merges length: ", len(self.merges))

        for _ in range(num_merges):
            stats = self.get_stats(ids)
            if not stats:
                # Fewer than two tokens remain: nothing left to merge.
                break
            pair = max(stats, key=stats.get)
            # The vocab grows by one entry per merge, so its length is the next free id.
            new_id = len(self.vocab)
            ids = self.merge(ids, pair, new_id)
            self.merges[pair] = new_id
            self.vocab[new_id] = self.vocab[pair[0]] + self.vocab[pair[1]]

        print(f"After training: ids length: {len(ids)}")
        print(f"After training: tokens length: {len(tokens)}")
        print("After training: merges length: ", len(self.merges))
        print(f"After Training Vocab length {len(self.vocab)}")
        if ids:
            print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
        else:
            # Empty training text: the ratio would be 0 / 0.
            print("compression ratio: n/a (empty corpus)")
        return self.vocab, self.merges

    def encode(self, text):
        """Encode ``text`` into token ids using the learned merges.

        Starting from the UTF-8 bytes, repeatedly applies the earliest-learned
        merge (lowest merged id) present in the sequence until none applies.

        Args:
            text: String to encode.

        Returns:
            List of token ids.
        """
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pairs that were never merged rank as +inf, so they are only picked
            # when no mergeable pair is left.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            tokens = self.merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, tokens):
        """Decode token ids back into text.

        Args:
            tokens: Iterable of token ids present in ``self.vocab``.

        Returns:
            The decoded string; invalid UTF-8 sequences are replaced with the
            Unicode replacement character rather than raising.

        Raises:
            KeyError: If an id is not in ``self.vocab``.
        """
        raw_bytes = b"".join(self.vocab[idx] for idx in tokens)
        return raw_bytes.decode("utf-8", errors="replace")
 


In [6]:
import time

# Training configuration, named so the values are easy to find and tune.
CORPUS_PATH = "gu_corpus.txt"
MAX_VOCAB_SIZE = 5000  # 256 byte tokens + 4744 learned merges
SAMPLE_SIZE = 300000   # number of leading corpus characters used for training

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock changes.
start_time = time.perf_counter()
tokenizer = BPEGujaratiTokenizer(
    corpus_path=CORPUS_PATH,
    max_vocab_size=MAX_VOCAB_SIZE,
    sample_size=SAMPLE_SIZE,
)
end_time = time.perf_counter()
print(f"Time taken to train: {end_time - start_time} seconds")
print("--------------------------------")


Before Training Vocab length 256
num_merges required 4744
Before training: ids length: 755940
Before training: tokens length: 755940
Before training: merges length: 0
After training: ids length: 76306
After training: tokens length: 755940
After training: merges length: 4744
After Training Vocab length 5000
compression ratio: 9.91X
Time taken to train: 199.02717900276184 seconds
--------------------------------


In [5]:
# Round-trip demo: encode one sample, decode it, then check a batch of sentences.
sample_text = "હું તને પ્રેમ કરું છું"

# Time encoding on its own (perf_counter: monotonic, high-resolution clock).
start_time = time.perf_counter()
sample_encoded = tokenizer.encode(sample_text)
print(sample_encoded)
end_time = time.perf_counter()
print(f"Time taken to encode: {end_time - start_time} seconds")
print("--------------------------------")

# Time decoding on its own by reusing the ids produced above, so the
# measurement is not inflated by a second encode.
start_time = time.perf_counter()
print(tokenizer.decode(sample_encoded))
end_time = time.perf_counter()
print(f"Time taken to decode: {end_time - start_time} seconds")
print("--------------------------------")

start_time = time.perf_counter()
sentences = [
    "હું આજે ખૂબ ખુશ છું.",
    "તું શું કરે છે? ",
    "મને ચા પીવી છે. ",
    "એ બધું સરસ છે. ",
    "આ પુસ્તક ખૂબ રસપ્રદ છે. ",
    "તારે ક્યારે આવવું છે? ",
    "આ મારો મિત્ર છે. ",
    "હું શાકભાજી લઈ આવ્યો છું. ",
    "આકાશ માં વાદળ છે. ",
    "શાળા ક્યારે શરૂ થશે? ",
    "આ પુસ્તક ખૂબ રસપ્રદ છે.",
]
for sentence in sentences:
    # Encode and decode once per sentence and reuse the results for every print.
    encoded = tokenizer.encode(sentence)
    decoded = tokenizer.decode(encoded)
    print("original: ", sentence)
    print("encoded: ", encoded)
    print("decoded: ", decoded)
    print(decoded == sentence)  # True when the round trip is lossless
end_time = time.perf_counter()
# This interval covers encoding and decoding every sentence, not decoding alone.
print(f"Time taken to encode and decode all sentences: {end_time - start_time} seconds")
print("--------------------------------")

[294, 307, 164, 292, 431, 325, 317, 3229, 444]
Time taken to encode: 0.0007619857788085938 seconds
--------------------------------
હું તને પ્રેમ કરું છું
Time taken to decode: 0.0004019737243652344 seconds
--------------------------------
original: હું આજે ખૂબ ખુશ છું.
encoded: [294, 307, 1414, 853, 928, 1793, 482, 444, 46]
decoded: હું આજે ખૂબ ખુશ છું.
True
original: તું શું કરે છે? 
encoded: [3519, 182, 307, 391, 4339, 63, 32]
decoded: તું શું કરે છે? 
True
original: મને ચા પીવી છે. 
encoded: [274, 292, 154, 758, 519, 269, 296, 46, 32]
decoded: મને ચા પીવી છે. 
True
original: એ બધું સરસ છે. 
encoded: [512, 4222, 3997, 2296, 3648, 46, 32]
decoded: એ બધું સરસ છે. 
True
original: આ પુસ્તક ખૂબ રસપ્રદ છે. 
encoded: [256, 4844, 2469, 290, 3227, 311, 4738, 345, 3648, 46, 32]
decoded: આ પુસ્તક ખૂબ રસપ્રદ છે. 
True
original: તારે ક્યારે આવવું છે? 
encoded: [2460, 335, 484, 340, 793, 296, 63, 32]
decoded: તારે ક્યારે આવવું છે? 
True
original: આ મારો મિત્ર છે. 
encoded: [256, 134, 309, 763, 40