vocab corpus increased - 300000

Files changed:
- app.py +1 -1
- encoder.py +3 -1

The BPE training sample is raised to 300,000 characters: sample_size=300000 is now passed to BPEGujaratiTokenizer both where app.py initializes it at startup and in encoder.py's __main__ timing block, and train_bpe gains two prints reporting the vocab length before and after training.
    	
app.py
CHANGED

@@ -13,7 +13,7 @@ class DecodeRequest(BaseModel):
     tokens: str
 
 # Initialize the tokenizer
-tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=…
+tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
 
 app = FastAPI()
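
For context, a minimal sketch of how app.py plausibly wires this tokenizer into the API. Only DecodeRequest, the tokenizer line, and app = FastAPI() appear in the diff; the /decode route, the encoder import path, the decode() method, and the space-separated token wire format are assumptions for illustration, not taken from the repo.

    # Hypothetical sketch of the surrounding app.py; only DecodeRequest,
    # the tokenizer line, and `app = FastAPI()` are visible in the diff above.
    from fastapi import FastAPI
    from pydantic import BaseModel

    from encoder import BPEGujaratiTokenizer  # assumed import path

    class DecodeRequest(BaseModel):
        tokens: str

    # Initialize the tokenizer (the line this commit changes)
    tokenizer = BPEGujaratiTokenizer(
        corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000
    )

    app = FastAPI()

    @app.post("/decode")  # route name is an assumption
    def decode(req: DecodeRequest):
        # Assumed wire format: space-separated token ids in one string.
        ids = [int(t) for t in req.tokens.split()]
        return {"text": tokenizer.decode(ids)}  # decode() assumed to exist

Note that because the tokenizer is constructed at import time, training runs on every server start, so raising sample_size to 300,000 directly increases startup time.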
    	
encoder.py
CHANGED

@@ -42,6 +42,7 @@ class BPEGujaratiTokenizer:
 
     def train_bpe(self, corpus, max_vocab_size, sample_size=None):
         self.vocab = {idx: bytes([idx]) for idx in range(256)}
+        print(f"Before training: vocab length: {len(self.vocab)}")
         if sample_size :
             corpus = corpus[:sample_size]
         num_merges = max_vocab_size - len(self.vocab)
@@ -66,6 +67,7 @@ class BPEGujaratiTokenizer:
         print(f"After training: tokens length: {len(tokens)}")
         print("After training: merges length: ", len(self.merges))
         print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+        print(f"After training: vocab length: {len(self.vocab)}")
         return self.vocab, self.merges
 
     def encode(self, text):
@@ -88,7 +90,7 @@ class BPEGujaratiTokenizer:
 import time
 if __name__ == "__main__":
     start_time = time.time()
-    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=…
+    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=300000)
     end_time = time.time()
     print(f"Time taken to train: {end_time - start_time} seconds")
     print("--------------------------------")
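
The two prints added in train_bpe bracket the merge loop, whose body falls outside these hunks. As a rough guide to what that loop does, here is a minimal byte-level BPE training sketch consistent with the visible lines (the 256-byte base vocab, num_merges = max_vocab_size - len(self.vocab), and the merges/vocab structures); get_stats and merge are hypothetical helpers, not names taken from encoder.py.

    # Minimal byte-level BPE training loop; a sketch of what train_bpe's
    # body between the two new prints plausibly does, not the repo's code.
    def get_stats(ids):
        """Count occurrences of each adjacent token pair."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(ids, pair, idx):
        """Replace every occurrence of `pair` in `ids` with the new token `idx`."""
        out, i = [], 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                out.append(idx)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    corpus = "કેમ છો"                       # stand-in for gu_corpus.txt text
    ids = list(corpus.encode("utf-8"))      # raw UTF-8 bytes of the corpus
    vocab = {i: bytes([i]) for i in range(256)}
    merges = {}
    num_merges = 5000 - len(vocab)          # matches num_merges in the diff
    for step in range(num_merges):
        stats = get_stats(ids)
        if not stats:
            break                           # nothing left to merge
        pair = max(stats, key=stats.get)    # most frequent adjacent pair
        idx = 256 + step                    # next unused token id
        ids = merge(ids, pair, idx)
        merges[pair] = idx
        vocab[idx] = vocab[pair[0]] + vocab[pair[1]]

Each merge replaces the currently most frequent adjacent pair with a new token id, so max_vocab_size=5000 allows up to 4744 merges on top of the 256 raw bytes; truncating the corpus to the first sample_size characters before this loop is exactly the knob this commit turns up to 300,000.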