crpatel committed
Commit af91c4e
1 Parent(s): 8846920

README update

Files changed (2)
  1. app.py +1 -1
  2. indi-lang.ipynb +278 -0
app.py CHANGED
@@ -13,7 +13,7 @@ class DecodeRequest(BaseModel):
     tokens: str
 
 # Initialize the tokenizer
-tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)
+tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=150000)
 
 app = FastAPI()
 
 
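The only change to app.py raises sample_size from 20000 to 150000. In the BPEGujaratiTokenizer added below (see indi-lang.ipynb), sample_size truncates the training corpus before the BPE merges are learned, while max_vocab_size fixes how many merge rules are created, so this change trains the tokenizer on a larger slice of gu_corpus.txt without changing the vocabulary size. A minimal sketch of that relationship, using names and defaults from this diff (an illustration of the parameters, not code from app.py):

    # mirrors what train_bpe in indi-lang.ipynb does with these two parameters
    corpus = read_corpus("gu_corpus.txt")   # read_corpus is defined in the notebook
    training_text = corpus[:150000]         # sample_size: only this slice is used for training
    num_merges = 5000 - 256                 # max_vocab_size minus the 256 base byte tokens -> 4744 merges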
indi-lang.ipynb ADDED
@@ -0,0 +1,278 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def read_corpus(corpus_path:str):\n",
+ " with open(corpus_path, 'r', encoding='utf-8') as f:\n",
+ " text = f.read()\n",
+ " return text\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "class BPEGujaratiTokenizer:\n",
+ " def __init__(self, corpus_path:str, max_vocab_size:int=5001, sample_size:int=50000):\n",
+ " self.corpus = read_corpus(corpus_path)\n",
+ " self.max_vocab_size = max_vocab_size\n",
+ " self.corpus_vocab = sorted(list(set(self.corpus)))\n",
+ " self.corpus_vocab_size = len(self.corpus_vocab)\n",
+ " self.stoi = { ch:i for i,ch in enumerate(self.corpus_vocab) }\n",
+ " self.itos = { i:ch for i,ch in enumerate(self.corpus_vocab) }\n",
+ " self.sample_size = sample_size\n",
+ "\n",
+ " self.vocab, self.merges = self.train_bpe(self.corpus, self.max_vocab_size, self.sample_size)\n",
+ "\n",
+ "\n",
+ " def get_stats(self, ids):\n",
+ " counts = {}\n",
+ " for pair in zip(ids, ids[1:]):\n",
+ " counts[pair] = counts.get(pair, 0) + 1\n",
+ " return counts\n",
+ "\n",
+ "\n",
+ " def merge(self,ids, pair, idx):\n",
+ " newids = []\n",
+ " i = 0\n",
+ " while i < len(ids):\n",
+ " if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
+ " newids.append(idx)\n",
+ " i += 2\n",
+ " else:\n",
+ " newids.append(ids[i])\n",
+ " i += 1\n",
+ " return newids\n",
+ "\n",
+ "\n",
+ "\n",
+ " def train_bpe(self, corpus, max_vocab_size, sample_size=None):\n",
+ " self.vocab = {idx: bytes([idx]) for idx in range(256)}\n",
+ " print(f\"Before Training Vocab length {len(self.vocab)}\")\n",
+ " if sample_size :\n",
+ " corpus = corpus[:sample_size]\n",
+ " num_merges = max_vocab_size - len(self.vocab)\n",
+ " print(f\"num_merges required {num_merges}\")\n",
+ " tokens = corpus.encode('utf-8')\n",
+ " tokens= list(map(int, tokens))\n",
+ " ids = list(tokens)\n",
+ " self.merges = {} # (int, int) -> int\n",
+ " print(f\"Before training: ids length: {len(ids)}\")\n",
+ " print(f\"Before training: tokens length: {len(tokens)}\")\n",
+ " print(\"Before training: merges length: \", len(self.merges))\n",
+ "\n",
+ " for i in range(num_merges):\n",
+ " stats = self.get_stats(ids)\n",
+ " pair = max(stats, key=stats.get)\n",
+ " idx = len(self.vocab)+i\n",
+ " ids = self.merge(ids, pair, idx)\n",
+ " self.merges[pair] = idx\n",
+ " # merge the vocab\n",
+ " \n",
+ " for (p0, p1), idx in self.merges.items():\n",
+ " self.vocab[idx] = self.vocab[p0] + self.vocab[p1]\n",
+ " print(f\"After training: ids length: {len(ids)}\")\n",
+ " print(f\"After training: tokens length: {len(tokens)}\")\n",
+ " print(\"After training: merges length: \", len(self.merges))\n",
+ " print(f\"After Training Vocab length {len(self.vocab)}\")\n",
+ " print(f\"compression ratio: {len(tokens) / len(ids):.2f}X\")\n",
+ " return self.vocab, self.merges\n",
+ "\n",
+ " def encode(self, text):\n",
+ " tokens = list(text.encode(\"utf-8\"))\n",
+ " while len(tokens) >= 2:\n",
+ " stats = self.get_stats(tokens)\n",
+ " pair = min(stats, key=lambda p: self.merges.get(p, float(\"inf\")))\n",
+ " if pair not in self.merges:\n",
+ " break # nothing else can be merged\n",
+ " idx = self.merges[pair]\n",
+ " tokens = self.merge(tokens, pair, idx)\n",
+ " return tokens\n",
+ "\n",
+ " \n",
+ " def decode(self, tokens):\n",
+ " tokens = b\"\".join(self.vocab[idx] for idx in tokens)\n",
+ " text = tokens.decode(\"utf-8\", errors=\"replace\")\n",
+ " return text\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Before Training Vocab length 256\n",
+ "num_merges required 4744\n",
+ "Before training: ids length: 379218\n",
+ "Before training: tokens length: 379218\n",
+ "Before training: merges length: 0\n",
+ "After training: ids length: 36290\n",
+ "After training: tokens length: 379218\n",
+ "After training: merges length: 4744\n",
+ "After Training Vocab length 5000\n",
+ "compression ratio: 10.45X\n",
+ "Time taken to train: 96.17453122138977 seconds\n",
+ "--------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "tokenizer = BPEGujaratiTokenizer(corpus_path=\"gu_corpus.txt\", max_vocab_size=5000, sample_size=150000)\n",
+ "end_time = time.time()\n",
+ "print(f\"Time taken to train: {end_time - start_time} seconds\")\n",
+ "print(\"--------------------------------\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[292, 310, 164, 290, 363, 329, 325, 310, 155, 600]\n",
+ "Time taken to encode: 0.0006651878356933594 seconds\n",
+ "--------------------------------\n",
+ "હું તને પ્રેમ કરું છું\n",
+ "Time taken to decode: 0.0004611015319824219 seconds\n",
+ "--------------------------------\n",
+ "original: હું આજે ખૂબ ખુશ છું.\n",
+ "encoded: [292, 310, 1987, 150, 314, 172, 1804, 503, 600, 46]\n",
+ "decoded: હું આજે ખૂબ ખુશ છું.\n",
+ "True\n",
+ "original: તું શું કરે છે? \n",
+ "encoded: [279, 1700, 310, 412, 267, 155, 260, 63, 32]\n",
+ "decoded: તું શું કરે છે? \n",
+ "True\n",
+ "original: મને ચા પીવી છે. \n",
+ "encoded: [274, 290, 154, 553, 549, 269, 155, 260, 46, 32]\n",
+ "decoded: મને ચા પીવી છે. \n",
+ "True\n",
+ "original: એ બધું સરસ છે. \n",
+ "encoded: [479, 334, 343, 310, 184, 1538, 503, 260, 46, 32]\n",
+ "decoded: એ બધું સરસ છે. \n",
+ "True\n",
+ "original: આ પુસ્તક ખૂબ રસપ્રદ છે. \n",
+ "encoded: [256, 134, 298, 280, 437, 294, 1990, 172, 316, 326, 1308, 361, 503, 260, 46, 32]\n",
+ "decoded: આ પુસ્તક ખૂબ રસપ્રદ છે. \n",
+ "True\n",
+ "original: તારે ક્યારે આવવું છે? \n",
+ "encoded: [279, 344, 149, 482, 347, 1691, 155, 260, 63, 32]\n",
+ "decoded: તારે ક્યારે આવવું છે? \n",
+ "True\n",
+ "original: આ મારો મિત્ર છે. \n",
+ "encoded: [256, 134, 1803, 283, 174, 366, 288, 503, 260, 46, 32]\n",
+ "decoded: આ મારો મિત્ર છે. \n",
+ "True\n",
+ "original: હું શાકભાજી લઈ આવ્યો છું. \n",
+ "encoded: [292, 1700, 621, 418, 429, 1527, 388, 788, 413, 155, 600, 46, 32]\n",
+ "decoded: હું શાકભાજી લઈ આવ્યો છું. \n",
+ "True\n",
+ "original: આકાશ માં વાદળ છે. \n",
+ "encoded: [256, 134, 294, 1089, 307, 285, 181, 405, 345, 503, 260, 46, 32]\n",
+ "decoded: આકાશ માં વાદળ છે. \n",
+ "True\n",
+ "original: શાળા ક્યારે શરૂ થશે? \n",
+ "encoded: [330, 888, 391, 482, 182, 268, 1248, 165, 330, 260, 63, 32]\n",
+ "decoded: શાળા ક્યારે શરૂ થશે? \n",
+ "True\n",
+ "original: આ પુસ્તક ખૂબ રસપ્રદ છે.\n",
+ "encoded: [256, 134, 298, 280, 437, 294, 1990, 172, 316, 326, 1308, 361, 503, 260, 46]\n",
+ "decoded: આ પુસ્તક ખૂબ રસપ્રદ છે.\n",
+ "True\n",
+ "Time taken to decode: 0.009427070617675781 seconds\n",
+ "--------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "start_time = time.time()\n",
+ "print(tokenizer.encode(\"હું તને પ્રેમ કરું છું\"))\n",
+ "end_time = time.time()\n",
+ "print(f\"Time taken to encode: {end_time - start_time} seconds\")\n",
+ "print(\"--------------------------------\")\n",
+ "start_time = time.time()\n",
+ "print(tokenizer.decode(tokenizer.encode(\"હું તને પ્રેમ કરું છું\")))\n",
+ "end_time = time.time()\n",
+ "print(f\"Time taken to decode: {end_time - start_time} seconds\")\n",
+ "print(\"--------------------------------\")\n",
+ "start_time = time.time()\n",
+ "sentences = [\"હું આજે ખૂબ ખુશ છું.\",\"તું શું કરે છે? \",\"મને ચા પીવી છે. \",\"એ બધું સરસ છે. \",\"આ પુસ્તક ખૂબ રસપ્રદ છે. \",\"તારે ક્યારે આવવું છે? \",\"આ મારો મિત્ર છે. \",\"હું શાકભાજી લઈ આવ્યો છું. \",\"આકાશ માં વાદળ છે. \",\"શાળા ક્યારે શરૂ થશે? \",'આ પુસ્તક ખૂબ રસપ્રદ છે.']\n",
+ "for sentence in sentences:\n",
+ " print(\"original: \", sentence)\n",
+ " print(\"encoded: \", tokenizer.encode(sentence))\n",
+ " print(\"decoded: \", tokenizer.decode(tokenizer.encode(sentence)))\n",
+ " print(tokenizer.decode(tokenizer.encode(sentence)) == sentence)\n",
+ "end_time = time.time()\n",
+ "print(f\"Time taken to decode: {end_time - start_time} seconds\")\n",
+ "print(\"--------------------------------\") "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kaggle": {
+ "accelerator": "none",
+ "dataSources": [
+ {
+ "datasetId": 6426227,
+ "sourceId": 10374225,
+ "sourceType": "datasetVersion"
+ }
+ ],
+ "dockerImageVersionId": 30822,
+ "isGpuEnabled": false,
+ "isInternetEnabled": true,
+ "language": "python",
+ "sourceType": "notebook"
+ },
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
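Note: the route handlers of app.py are not part of this commit; only the DecodeRequest model and the tokenizer construction appear in the hunk above. A hypothetical sketch of how a decode endpoint could wire DecodeRequest to the notebook's BPEGujaratiTokenizer.decode (the route path, the comma-separated token format, and the import of the tokenizer class are assumptions, not the actual app.py code):

    from fastapi import FastAPI
    from pydantic import BaseModel
    # assumes BPEGujaratiTokenizer has been moved from indi-lang.ipynb into an importable module
    from tokenizer import BPEGujaratiTokenizer

    class DecodeRequest(BaseModel):
        tokens: str   # e.g. "292,310,164" -- assumed comma-separated token ids

    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=150000)
    app = FastAPI()

    @app.post("/decode")   # hypothetical route, not taken from app.py
    def decode(req: DecodeRequest):
        ids = [int(t) for t in req.tokens.split(",")]   # parse the string into integer token ids
        return {"text": tokenizer.decode(ids)}          # decode() maps ids back to UTF-8 text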