crpatel committed on
Commit
bcc12b1
·
1 Parent(s): 0102368
Files changed (8) hide show
  1. .dockerignore +7 -0
  2. .gitignore +171 -0
  3. Dockerfile +18 -0
  4. app.py +51 -0
  5. encoder.py +114 -0
  6. gu_corpus.txt +0 -0
  7. requirements.txt +3 -0
  8. static/index.html +63 -0
.dockerignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ .git
4
+ .env
5
+ venv/
6
+ .vscode/
7
+ .DS_Store
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim Python 3.9 on Debian buster.
FROM python:3.9-slim-buster

# All subsequent commands run from /app.
WORKDIR /app

# Install dependencies first so this layer is cached independently
# of application-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Bring in the application source.
COPY . .

# Uvicorn serves the FastAPI app on this port.
EXPOSE 7860

# Launch the API server.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from encoder import BPEGujaratiTokenizer
from fastapi.middleware.cors import CORSMiddleware


class EncodeRequest(BaseModel):
    """Request body for /encode."""
    text: str  # raw text to tokenize


class DecodeRequest(BaseModel):
    """Request body for /decode."""
    tokens: str  # comma-separated token ids, e.g. "257,98,260"


# Initialize the tokenizer once at startup; BPE training happens inside
# __init__, so the first request does not pay the training cost.
tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)

app = FastAPI()

# Allow browser clients from any origin to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve static files (HTML, CSS, JS).
app.mount("/static", StaticFiles(directory="static"), name="static")


@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the demo UI."""
    with open("static/index.html", encoding="utf-8") as f:
        return f.read()


@app.post("/encode")
async def encode_text(request: EncodeRequest):
    """Encode the input text and return the BPE token ids."""
    return {"encoded_tokens": tokenizer.encode(request.text)}


@app.post("/decode")
async def decode_tokens(request: DecodeRequest):
    """Decode comma-separated token ids back into text.

    Returns HTTP 400 (instead of an unhandled 500) when the token string
    is not a comma-separated list of known integer ids.
    """
    try:
        # Skip empty fragments so trailing commas ("1,2,") don't crash.
        tokens = [int(t) for t in request.tokens.split(",") if t.strip()]
    except ValueError:
        raise HTTPException(status_code=400,
                            detail="tokens must be a comma-separated list of integers")
    try:
        decoded_text = tokenizer.decode(tokens)
    except KeyError as err:
        # tokenizer.decode raises KeyError for ids outside the trained vocab.
        raise HTTPException(status_code=400, detail=f"unknown token id: {err}")
    return {"decoded_text": decoded_text}
encoder.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ def read_corpus(corpus_path:str):
3
+ with open(corpus_path, 'r', encoding='utf-8') as f:
4
+ text = f.read()
5
+ return text
6
+
7
+
8
+
9
class BPEGujaratiTokenizer:
    """Byte-pair-encoding (BPE) tokenizer trained on a Gujarati text corpus.

    The vocabulary starts from the 256 raw byte values and learns up to
    ``max_vocab_size - 256`` merges from (a sample of) the corpus.
    """

    def __init__(self, corpus_path: str, max_vocab_size: int = 5000, sample_size: int = 20000):
        self.corpus = read_corpus(corpus_path)
        self.max_vocab_size = max_vocab_size
        # Character-level vocabulary of the raw corpus (kept for reference;
        # the BPE tables below operate on UTF-8 bytes, not characters).
        self.corpus_vocab = sorted(list(set(self.corpus)))
        self.corpus_vocab_size = len(self.corpus_vocab)
        self.stoi = {ch: i for i, ch in enumerate(self.corpus_vocab)}
        self.itos = {i: ch for i, ch in enumerate(self.corpus_vocab)}
        self.sample_size = sample_size

        self.vocab, self.merges = self.train_bpe(self.corpus, self.max_vocab_size, self.sample_size)

    def get_stats(self, ids):
        """Count occurrences of each adjacent token pair in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train_bpe(self, corpus, max_vocab_size, sample_size=None):
        """Learn BPE merges from ``corpus``.

        Args:
            corpus: training text.
            max_vocab_size: target vocabulary size (256 base bytes + merges).
            sample_size: if truthy, train on only the first ``sample_size`` characters.

        Returns:
            (vocab, merges): id -> bytes table and (id, id) -> merged-id table.
        """
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        if sample_size:
            corpus = corpus[:sample_size]
        num_merges = max_vocab_size - len(self.vocab)
        tokens = list(corpus.encode('utf-8'))
        ids = list(tokens)
        self.merges = {}  # (int, int) -> int
        for i in range(num_merges):
            stats = self.get_stats(ids)
            if not stats:
                # Fewer than two tokens remain: nothing left to merge.
                # (Without this guard, max() on an empty dict raises ValueError
                # whenever the corpus sample supports fewer merges than requested.)
                break
            pair = max(stats, key=stats.get)
            # self.vocab still holds only the 256 base bytes during this loop,
            # so new ids are assigned sequentially as 256, 257, ...
            idx = len(self.vocab) + i
            ids = self.merge(ids, pair, idx)
            self.merges[pair] = idx
        # Materialize the byte string for every learned merge id.
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]
        if ids:  # guard: empty corpus would divide by zero
            print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
        return self.vocab, self.merges

    def encode(self, text):
        """Encode ``text`` into a list of token ids using the learned merges."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the mergeable pair with the lowest merge rank (earliest learned).
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            idx = self.merges[pair]
            tokens = self.merge(tokens, pair, idx)
        return tokens

    def decode(self, tokens):
        """Decode a list of token ids back into text.

        Raises KeyError for ids not present in the vocabulary; invalid UTF-8
        byte sequences are replaced rather than raising.
        """
        tokens = b"".join(self.vocab[idx] for idx in tokens)
        text = tokens.decode("utf-8", errors="replace")
        return text
+ return text
87
+
88
import time

if __name__ == "__main__":
    # Train the tokenizer and report wall-clock timings for each stage.
    t0 = time.time()
    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)
    t1 = time.time()
    print(f"Time taken to train: {t1 - t0} seconds")
    print("--------------------------------")

    t0 = time.time()
    print(tokenizer.encode("હું તને પ્રેમ કરું છું"))
    t1 = time.time()
    print(f"Time taken to encode: {t1 - t0} seconds")
    print("--------------------------------")

    t0 = time.time()
    print(tokenizer.decode(tokenizer.encode("હું તને પ્રેમ કરું છું")))
    t1 = time.time()
    print(f"Time taken to decode: {t1 - t0} seconds")
    print("--------------------------------")

    # Round-trip a batch of sample sentences and verify each one survives
    # encode -> decode unchanged.
    t0 = time.time()
    sentences = ["હું આજે ખૂબ ખુશ છું.", "તું શું કરે છે? ", "મને ચા પીવી છે. ", "એ બધું સરસ છે. ", "આ પુસ્તક ખૂબ રસપ્રદ છે. ", "તારે ક્યારે આવવું છે? ", "આ મારો મિત્ર છે. ", "હું શાકભાજી લઈ આવ્યો છું. ", "આકાશ માં વાદળ છે. ", "શાળા ક્યારે શરૂ થશે? ", "આ પુસ્તક ખૂબ રસપ્રદ છે."]
    for sentence in sentences:
        encoded = tokenizer.encode(sentence)
        decoded = tokenizer.decode(encoded)
        print("original: ", sentence)
        print("encoded: ", encoded)
        print("decoded: ", decoded)
        print(decoded == sentence)
    t1 = time.time()
    print(f"Time taken to decode: {t1 - t0} seconds")
    print("--------------------------------")
gu_corpus.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fastapi
2
+ python-multipart
3
+ uvicorn
static/index.html ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gujarati BPE Tokenizer</title>
    <script>
        // POST the input text to /encode and show the resulting token ids.
        async function encode() {
            const text = document.getElementById("inputText").value;
            document.getElementById("originalSentence").innerText = text;
            const response = await fetch("/encode", {
                method: "POST",
                headers: {
                    "Content-Type": "application/json"
                },
                body: JSON.stringify({ text })
            });
            const data = await response.json();
            document.getElementById("encodedTokens").innerText = data.encoded_tokens;
            // Fix: a previous version also assigned to getElementById("originalTokens"),
            // but no such element exists in this page, so every encode threw a
            // TypeError after rendering. That dead assignment has been removed.
        }

        // POST comma-separated token ids to /decode and show the decoded text.
        async function decode() {
            const tokens = document.getElementById("inputTokens").value;
            document.getElementById("inputTokensDisplay").innerText = tokens;
            const response = await fetch("/decode", {
                method: "POST",
                headers: {
                    "Content-Type": "application/json"
                },
                body: JSON.stringify({ tokens })
            });
            const data = await response.json();
            document.getElementById("decodedText").innerText = data.decoded_text;
        }

        // Clear every input field and output span.
        function resetFields() {
            document.getElementById("inputText").value = '';
            document.getElementById("inputTokens").value = '';
            document.getElementById("encodedTokens").innerText = '';
            document.getElementById("originalSentence").innerText = '';
            document.getElementById("decodedText").innerText = '';
            document.getElementById("inputTokensDisplay").innerText = '';
        }
    </script>
</head>
<body>
    <h1>Basic Gujarati BPE Tokenizer (5000 Vocab)</h1>
    <h2>Encode Text</h2>
    <input type="text" id="inputText" placeholder="Enter text to encode...">
    <button onclick="encode()">Encode</button>
    <p>Encoded Tokens: <span id="encodedTokens"></span></p>
    <p>Original Sentence: <span id="originalSentence"></span></p>

    <h2>Decode Tokens</h2>
    <input type="text" id="inputTokens" placeholder="Enter tokens to decode...">
    <button onclick="decode()">Decode</button>
    <p>Decoded Text: <span id="decodedText"></span></p>
    <p>Original Tokens Inputted: <span id="inputTokensDisplay"></span></p>

    <button onclick="resetFields()">Reset All</button>
</body>
</html>