crpatel committed on
Commit
bcc12b1
·
1 Parent(s): 0102368
Files changed (8) hide show
  1. .dockerignore +7 -0
  2. .gitignore +171 -0
  3. Dockerfile +18 -0
  4. app.py +51 -0
  5. encoder.py +114 -0
  6. gu_corpus.txt +0 -0
  7. requirements.txt +3 -0
  8. static/index.html +63 -0
.dockerignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ .git
4
+ .env
5
+ venv/
6
+ .vscode/
7
+ .DS_Store
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim Python 3.9 on Debian buster.
FROM python:3.9-slim-buster

# All subsequent commands run from /app.
WORKDIR /app

# Install dependencies first so this layer is cached independently
# of application-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Bring in the application source.
COPY . .

# Uvicorn serves the FastAPI app on this port.
EXPOSE 7860

# Launch the API server.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from encoder import BPEGujaratiTokenizer
from fastapi.middleware.cors import CORSMiddleware


class EncodeRequest(BaseModel):
    """Request body for /encode."""
    text: str  # raw text to tokenize


class DecodeRequest(BaseModel):
    """Request body for /decode."""
    tokens: str  # comma-separated token ids, e.g. "257,98,260"


# Initialize the tokenizer once at startup; BPE training happens inside
# __init__, so the first request does not pay the training cost.
tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)

app = FastAPI()

# Allow browser clients from any origin to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve static files (HTML, CSS, JS).
app.mount("/static", StaticFiles(directory="static"), name="static")


@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the demo UI."""
    with open("static/index.html", encoding="utf-8") as f:
        return f.read()


@app.post("/encode")
async def encode_text(request: EncodeRequest):
    """Encode the input text and return the BPE token ids."""
    return {"encoded_tokens": tokenizer.encode(request.text)}


@app.post("/decode")
async def decode_tokens(request: DecodeRequest):
    """Decode comma-separated token ids back into text.

    Returns HTTP 400 (instead of an unhandled 500) when the token string
    is not a comma-separated list of known integer ids.
    """
    try:
        # Skip empty fragments so trailing commas ("1,2,") don't crash.
        tokens = [int(t) for t in request.tokens.split(",") if t.strip()]
    except ValueError:
        raise HTTPException(status_code=400,
                            detail="tokens must be a comma-separated list of integers")
    try:
        decoded_text = tokenizer.decode(tokens)
    except KeyError as err:
        # tokenizer.decode raises KeyError for ids outside the trained vocab.
        raise HTTPException(status_code=400, detail=f"unknown token id: {err}")
    return {"decoded_text": decoded_text}
encoder.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ def read_corpus(corpus_path:str):
3
+ with open(corpus_path, 'r', encoding='utf-8') as f:
4
+ text = f.read()
5
+ return text
6
+
7
+
8
+
9
class BPEGujaratiTokenizer:
    """Byte-pair-encoding (BPE) tokenizer trained on a Gujarati text corpus.

    The vocabulary starts from the 256 raw byte values and learns up to
    ``max_vocab_size - 256`` merges from (a sample of) the corpus.
    """

    def __init__(self, corpus_path: str, max_vocab_size: int = 5000, sample_size: int = 20000):
        self.corpus = read_corpus(corpus_path)
        self.max_vocab_size = max_vocab_size
        # Character-level vocabulary of the raw corpus (kept for reference;
        # the BPE tables below operate on UTF-8 bytes, not characters).
        self.corpus_vocab = sorted(list(set(self.corpus)))
        self.corpus_vocab_size = len(self.corpus_vocab)
        self.stoi = {ch: i for i, ch in enumerate(self.corpus_vocab)}
        self.itos = {i: ch for i, ch in enumerate(self.corpus_vocab)}
        self.sample_size = sample_size

        self.vocab, self.merges = self.train_bpe(self.corpus, self.max_vocab_size, self.sample_size)

    def get_stats(self, ids):
        """Count occurrences of each adjacent token pair in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train_bpe(self, corpus, max_vocab_size, sample_size=None):
        """Learn BPE merges from ``corpus``.

        Args:
            corpus: training text.
            max_vocab_size: target vocabulary size (256 base bytes + merges).
            sample_size: if truthy, train on only the first ``sample_size`` characters.

        Returns:
            (vocab, merges): id -> bytes table and (id, id) -> merged-id table.
        """
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        if sample_size:
            corpus = corpus[:sample_size]
        num_merges = max_vocab_size - len(self.vocab)
        tokens = list(corpus.encode('utf-8'))
        ids = list(tokens)
        self.merges = {}  # (int, int) -> int
        for i in range(num_merges):
            stats = self.get_stats(ids)
            if not stats:
                # Fewer than two tokens remain: nothing left to merge.
                # (Without this guard, max() on an empty dict raises ValueError
                # whenever the corpus sample supports fewer merges than requested.)
                break
            pair = max(stats, key=stats.get)
            # self.vocab still holds only the 256 base bytes during this loop,
            # so new ids are assigned sequentially as 256, 257, ...
            idx = len(self.vocab) + i
            ids = self.merge(ids, pair, idx)
            self.merges[pair] = idx
        # Materialize the byte string for every learned merge id.
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]
        if ids:  # guard: empty corpus would divide by zero
            print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
        return self.vocab, self.merges

    def encode(self, text):
        """Encode ``text`` into a list of token ids using the learned merges."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the mergeable pair with the lowest merge rank (earliest learned).
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            idx = self.merges[pair]
            tokens = self.merge(tokens, pair, idx)
        return tokens

    def decode(self, tokens):
        """Decode a list of token ids back into text.

        Raises KeyError for ids not present in the vocabulary; invalid UTF-8
        byte sequences are replaced rather than raising.
        """
        tokens = b"".join(self.vocab[idx] for idx in tokens)
        text = tokens.decode("utf-8", errors="replace")
        return text
+ return text
87
+
88
import time

if __name__ == "__main__":
    # Train the tokenizer and report wall-clock timings for each stage.
    t0 = time.time()
    tokenizer = BPEGujaratiTokenizer(corpus_path="gu_corpus.txt", max_vocab_size=5000, sample_size=20000)
    t1 = time.time()
    print(f"Time taken to train: {t1 - t0} seconds")
    print("--------------------------------")

    t0 = time.time()
    print(tokenizer.encode("હું તને પ્રેમ કરું છું"))
    t1 = time.time()
    print(f"Time taken to encode: {t1 - t0} seconds")
    print("--------------------------------")

    t0 = time.time()
    print(tokenizer.decode(tokenizer.encode("હું તને પ્રેમ કરું છું")))
    t1 = time.time()
    print(f"Time taken to decode: {t1 - t0} seconds")
    print("--------------------------------")

    # Round-trip a batch of sample sentences and verify each one survives
    # encode -> decode unchanged.
    t0 = time.time()
    sentences = ["હું આજે ખૂબ ખુશ છું.", "તું શું કરે છે? ", "મને ચા પીવી છે. ", "એ બધું સરસ છે. ", "આ પુસ્તક ખૂબ રસપ્રદ છે. ", "તારે ક્યારે આવવું છે? ", "આ મારો મિત્ર છે. ", "હું શાકભાજી લઈ આવ્યો છું. ", "આકાશ માં વાદળ છે. ", "શાળા ક્યારે શરૂ થશે? ", "આ પુસ્તક ખૂબ રસપ્રદ છે."]
    for sentence in sentences:
        encoded = tokenizer.encode(sentence)
        decoded = tokenizer.decode(encoded)
        print("original: ", sentence)
        print("encoded: ", encoded)
        print("decoded: ", decoded)
        print(decoded == sentence)
    t1 = time.time()
    print(f"Time taken to decode: {t1 - t0} seconds")
    print("--------------------------------")
gu_corpus.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fastapi
2
+ python-multipart
3
+ uvicorn
static/index.html ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gujarati BPE Tokenizer</title>
    <script>
        // POST the input text to /encode and show the resulting token ids.
        async function encode() {
            const text = document.getElementById("inputText").value;
            document.getElementById("originalSentence").innerText = text;
            const response = await fetch("/encode", {
                method: "POST",
                headers: {
                    "Content-Type": "application/json"
                },
                body: JSON.stringify({ text })
            });
            const data = await response.json();
            document.getElementById("encodedTokens").innerText = data.encoded_tokens;
            // Fix: a previous version also assigned to getElementById("originalTokens"),
            // but no such element exists in this page, so every encode threw a
            // TypeError after rendering. That dead assignment has been removed.
        }

        // POST comma-separated token ids to /decode and show the decoded text.
        async function decode() {
            const tokens = document.getElementById("inputTokens").value;
            document.getElementById("inputTokensDisplay").innerText = tokens;
            const response = await fetch("/decode", {
                method: "POST",
                headers: {
                    "Content-Type": "application/json"
                },
                body: JSON.stringify({ tokens })
            });
            const data = await response.json();
            document.getElementById("decodedText").innerText = data.decoded_text;
        }

        // Clear every input field and output span.
        function resetFields() {
            document.getElementById("inputText").value = '';
            document.getElementById("inputTokens").value = '';
            document.getElementById("encodedTokens").innerText = '';
            document.getElementById("originalSentence").innerText = '';
            document.getElementById("decodedText").innerText = '';
            document.getElementById("inputTokensDisplay").innerText = '';
        }
    </script>
</head>
<body>
    <h1>Basic Gujarati BPE Tokenizer (5000 Vocab)</h1>
    <h2>Encode Text</h2>
    <input type="text" id="inputText" placeholder="Enter text to encode...">
    <button onclick="encode()">Encode</button>
    <p>Encoded Tokens: <span id="encodedTokens"></span></p>
    <p>Original Sentence: <span id="originalSentence"></span></p>

    <h2>Decode Tokens</h2>
    <input type="text" id="inputTokens" placeholder="Enter tokens to decode...">
    <button onclick="decode()">Decode</button>
    <p>Decoded Text: <span id="decodedText"></span></p>
    <p>Original Tokens Inputted: <span id="inputTokensDisplay"></span></p>

    <button onclick="resetFields()">Reset All</button>
</body>
</html>