Spaces:

iamtatsuki05
/

ocr_and_translate_en2jp

Sleeping

App Files Files Community

iamtatsuki05 committed on May 9, 2024

Commit

b86e5c3

verified ·

1 Parent(s): c3c4f08

Synced repo using 'sync_with_huggingface' Github Action

Browse files

Files changed (17) hide show

.dockerignore +9 -0
Dockerfile +71 -0
Makefile +0 -0
compose.yaml +29 -0
docker/cpu/Dockerfile +68 -0
images/app_sample.png +0 -0
notebooks/helloworld.ipynb +51 -0
poetry.lock +0 -0
pyproject.toml +58 -0
scripts/genarate_tranrate_df.py +36 -0
src/__init__.py +0 -0
src/app.py +53 -0
src/ocr_and_translate_en2jp/__init__.py +0 -0
src/ocr_and_translate_en2jp/genarate_tranrate_df.py +59 -0
src/ocr_and_translate_en2jp/ocr.py +32 -0
src/ocr_and_translate_en2jp/translate.py +41 -0
tests/ocr_and_translate_en2jp/__init__.py +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,9 @@

+html/*
+.DS_Store
+.venv
+*.swp
+.mypy_cache
+.pytest_cache
+.ipynb_checkpoints
+__pycache__

Dockerfile ADDED Viewed

	@@ -0,0 +1,71 @@

+FROM ubuntu:22.04 AS base
+ARG PYTHON_VERSION=3.10
+ENV DEBIAN_FRONTEND=noninteractive
+ENV WORKDIR /app/
+WORKDIR /opt
+# install dev tools
+RUN apt-get update && apt-get install -y \
+  vim neovim nano \
+  git git-lfs \
+  zip unzip \
+  curl wget make build-essential xz-utils file tree \
+  sudo \
+  dnsutils \
+  tzdata language-pack-ja \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+# for Japanese settings
+# ENV TZ Asia/Tokyo
+# ENV LANG ja_JP.utf8
+# for US settings
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US
+# install Python
+RUN apt-get update && apt-get -yV upgrade && DEBIAN_FRONTEND=noninteractive apt-get -yV install \
+  build-essential libssl-dev libffi-dev \
+  python${PYTHON_VERSION} python${PYTHON_VERSION}-distutils python${PYTHON_VERSION}-dev \
+  && ln -s /usr/bin/python${PYTHON_VERSION} /usr/local/bin/python3 \
+  && ln -s /usr/bin/python${PYTHON_VERSION} /usr/local/bin/python \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+## install pip
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
+  && python3 get-pip.py \
+  && pip3 --no-cache-dir install --upgrade pip
+## install Poetry
+RUN curl -sSL https://install.python-poetry.org | python3 -
+ENV PATH $PATH:/root/.local/bin
+RUN poetry config virtualenvs.create true \
+  && poetry config virtualenvs.in-project false
+WORKDIR ${WORKDIR}
+# install python packages (runtime dependencies only)
+COPY poetry.lock pyproject.toml ./
+COPY src ./src
+# `--no-dev` was deprecated in Poetry 1.2 and removed in Poetry 2.0. The
+# installer step above fetches the latest Poetry, so use the group-based flag;
+# pyproject.toml already declares the dev dependencies as a `dev` group.
+RUN poetry install --without dev
+FROM base AS dev
+WORKDIR ${WORKDIR}
+# install python packages
+COPY poetry.lock pyproject.toml ./
+COPY src ./src
+RUN poetry install
+# install ocr tools
+# Clean the apt cache in the same layer, as the other apt-get steps in this
+# file do, so the package lists do not end up in the image.
+RUN apt-get update && apt-get install -y \
+  tesseract-ocr tesseract-ocr-jpn \
+  poppler-utils \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+# Hugging Face Hub Settings
+CMD ["poetry", "run", "streamlit", "run", "src/app.py", "--server.port", "7860"]

Makefile ADDED Viewed

File without changes

compose.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+version: "3.2"
+services:
+  ocr_and_translate_en2jp:
+    tty: true
+    stdin_open: true
+    user: root
+    working_dir: /app
+    build:
+      context: .
+      dockerfile: docker/cpu/Dockerfile
+      target: dev
+      # secrets:
+      #   - github_token
+      args:
+        progress: plain
+    volumes:
+      - type: bind
+        source: ./
+        target: /app
+    ports:
+      - "8501:8501"
+    command:
+      poetry run streamlit run src/app.py
+    environment:
+        PYTHONPATH: "/app/src"
+        PYTHONUNBUFFERED: 1
+# secrets:
+#   github_token:
+#     file: ${HOME}/.git-credentials

docker/cpu/Dockerfile ADDED Viewed

	@@ -0,0 +1,68 @@

+FROM ubuntu:22.04 AS base
+ARG PYTHON_VERSION=3.10
+ENV DEBIAN_FRONTEND=noninteractive
+ENV WORKDIR /app/
+WORKDIR /opt
+# install dev tools
+RUN apt-get update && apt-get install -y \
+  vim neovim nano \
+  git git-lfs \
+  zip unzip \
+  curl wget make build-essential xz-utils file tree \
+  sudo \
+  dnsutils \
+  tzdata language-pack-ja \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+# for Japanese settings
+# ENV TZ Asia/Tokyo
+# ENV LANG ja_JP.utf8
+# for US settings
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US
+# install Python
+RUN apt-get update && apt-get -yV upgrade && DEBIAN_FRONTEND=noninteractive apt-get -yV install \
+  build-essential libssl-dev libffi-dev \
+  python${PYTHON_VERSION} python${PYTHON_VERSION}-distutils python${PYTHON_VERSION}-dev \
+  && ln -s /usr/bin/python${PYTHON_VERSION} /usr/local/bin/python3 \
+  && ln -s /usr/bin/python${PYTHON_VERSION} /usr/local/bin/python \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+## install pip
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
+  && python3 get-pip.py \
+  && pip3 --no-cache-dir install --upgrade pip
+## install Poetry
+RUN curl -sSL https://install.python-poetry.org | python3 -
+ENV PATH $PATH:/root/.local/bin
+RUN poetry config virtualenvs.create true \
+  && poetry config virtualenvs.in-project false
+WORKDIR ${WORKDIR}
+# install python packages (runtime dependencies only)
+COPY poetry.lock pyproject.toml ./
+COPY src ./src
+# `--no-dev` was deprecated in Poetry 1.2 and removed in Poetry 2.0. The
+# installer step above fetches the latest Poetry, so use the group-based flag;
+# pyproject.toml already declares the dev dependencies as a `dev` group.
+RUN poetry install --without dev
+FROM base AS dev
+WORKDIR ${WORKDIR}
+# install python packages
+COPY poetry.lock pyproject.toml ./
+COPY src ./src
+RUN poetry install
+# install ocr tools
+# Clean the apt cache in the same layer, as the other apt-get steps in this
+# file do, so the package lists do not end up in the image.
+RUN apt-get update && apt-get install -y \
+  tesseract-ocr tesseract-ocr-jpn \
+  poppler-utils \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*

images/app_sample.png ADDED Viewed

notebooks/helloworld.ipynb ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "18edcb70-64a5-4d17-94c0-a86ecc435be4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hello world\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"hello world\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b90e3e1f-5aa1-43a0-b9e2-1d1bcaf9ff94",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,58 @@

+[tool.poetry]
+name = "ocr_and_translate_en2jp"
+version = "0.1.0"
+description = ""
+authors = ["iamtatsuki05 <[email protected]>"]
+packages = [
+    { include = "ocr_and_translate_en2jp", from = "src/" },
+]
+[tool.poetry.dependencies]
+python = "^3.10"
+python-dotenv = "^1.0.0"
+setuptools = "^69.0.3"
+fire = "^0.5.0"
+pydantic = "^2.5.3"
+beautifulsoup4 = "^4.12.2"
+selenium = "^4.16.0"
+fastapi = "^0.108.0"
+uvicorn = "^0.25.0"
+matplotlib = "^3.5.1"
+pandas = "^1.4.2"
+seaborn = "^0.11.2"
+japanize-matplotlib = "^1.1.3"
+numpy = "^1.22.3"
+jupyterlab = "^3.3.4"
+tqdm = "^4.64.0"
+scikit-learn = "^1.1.1"
+openpyxl = "^3.1.2"
+pytesseract = "^0.3.10"
+pdf2image = "^1.16.0"
+streamlit = "^1.34.0"
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.0.0"
+ipykernel = ">=6.13.0"
+autopep8 = ">=1.6.0"
+autoflake = ">=1.4"
+flake8 = ">=4.0.1"
+flake8-isort = ">=4.1.1"
+flake8-print = ">=4.0.0"
+isort = ">=5.10.1"
+black = ">=22.10.0"
+mypy = ">=0.971"
+tox = ">=3.25.1"
+pre-commit = ">=3.3.3"
+nbstripout = "0.6.1"
+[tool.isort]
+line_length = 88
+multi_line_output = 3
+include_trailing_comma = true
+[tool.black]
+skip-string-normalization = true
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"

scripts/genarate_tranrate_df.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from pathlib import Path
+from typing import Optional, Union
+import fire
+from ocr_and_translate_en2jp.genarate_tranrate_df import df_generator, output_results
+def df_generator_wrapper(
+    file_path: Union[str, Path],
+    output_dir: Optional[Union[str, Path]] = './',
+    output_file_name: str = 'output',
+    max_words: Optional[int] = None,
+    do_shuffle_output: bool = False,
+    seed: Optional[int] = None,
+    do_output_csv: Optional[bool] = True,
+    do_output_excel: Optional[bool] = False,
+    do_clean_noise_data: Optional[bool] = True,
+) -> None:
+    file_path = Path(file_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    df = df_generator(
+        file_path=file_path,
+        max_words=max_words,
+        do_shuffle_output=do_shuffle_output,
+        seed=seed,
+        do_clean_noise_data=do_clean_noise_data,
+    )
+    output_results(df, output_dir, output_file_name, do_output_csv, do_output_excel)
+if __name__ == '__main__':
+    fire.Fire(df_generator)

src/__init__.py ADDED Viewed

File without changes

src/app.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import shutil
+import tempfile
+from io import BytesIO
+from pathlib import Path
+import streamlit as st
+from ocr_and_translate_en2jp.genarate_tranrate_df import df_generator
+st.title('OCR and Translation App')
+uploaded_file = st.file_uploader('Choose a file')
+if uploaded_file is not None:
+    with tempfile.NamedTemporaryFile(
+        delete=False, suffix=Path(uploaded_file.name).suffix
+    ) as tmpfile:
+        shutil.copyfileobj(uploaded_file, tmpfile)
+        uploaded_file_path = tmpfile.name
+    max_words = st.number_input(
+        'Maximum number of words to process', min_value=1, value=50
+    )
+    do_shuffle = st.checkbox('Shuffle output')
+    seed = (
+        st.number_input('Seed for shuffling', min_value=0, value=42)
+        if do_shuffle
+        else None
+    )
+    do_clean_noise_data = st.checkbox('Clean noise data', value=True)
+    if st.button('Process'):
+        df = df_generator(
+            uploaded_file_path, max_words, do_shuffle, seed, do_clean_noise_data
+        )
+        st.dataframe(df)
+        csv = df.to_csv(index=False).encode('utf-8')
+        st.download_button(
+            label='Download data as CSV',
+            data=csv,
+            file_name='processed_data.csv',
+            mime='text/csv',
+        )
+        # REF: https://qiita.com/nyakiri_0726/items/2ae8cfb926c48072b190
+        df.to_excel(buf := BytesIO(), index=False)
+        st.download_button(
+            label="Download data as Excel",
+            data=buf.getvalue(),
+            file_name='processed_data.xlsx',
+            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        )

src/ocr_and_translate_en2jp/__init__.py ADDED Viewed

File without changes

src/ocr_and_translate_en2jp/genarate_tranrate_df.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from pathlib import Path
+from typing import Optional, Union
+import pandas as pd
+from tqdm.auto import tqdm
+from ocr_and_translate_en2jp.ocr import ocr_image
+from ocr_and_translate_en2jp.translate import get_translation, get_word_type_japanese
+tqdm.pandas()
+def clean_data(df: pd.DataFrame) -> pd.DataFrame:
+    df = df[df['word'].str.contains('[a-zA-Z]', na=False)].reset_index(drop=True)
+    df['word_pos'] = df['word'].progress_apply(get_word_type_japanese)
+    df = df[df['word_pos'].notnull()].reset_index(drop=True)
+    df['word_JP'] = df['word'].progress_apply(get_translation)
+    return df[df['word_JP'].notnull()].reset_index(drop=True)
+def output_results(
+    df: pd.DataFrame,
+    output_dir: Path,
+    output_file_name: str,
+    do_output_csv: bool,
+    do_output_excel: bool,
+) -> None:
+    if do_output_csv:
+        output_path = output_dir / f'{output_file_name}.csv'
+        df.to_csv(output_path, index=False)
+    if do_output_excel:
+        output_path = output_dir / f'{output_file_name}.xlsx'
+        df.to_excel(output_path, index=False)
+def df_generator(
+    file_path: Union[str, Path],
+    max_words: Optional[int] = None,
+    do_shuffle_output: bool = False,
+    seed: Optional[int] = None,
+    do_clean_noise_data: Optional[bool] = True,
+) -> pd.DataFrame:
+    file_path = Path(file_path)
+    ocr_result = list(set(ocr_image(file_path, is_return_list=True)))
+    df = pd.DataFrame(ocr_result, columns=['word'])
+    if do_clean_noise_data:
+        df = clean_data(df)
+    max_words = max_words or len(df)
+    if max_words > len(df):
+        max_words = len(df)
+    df = (
+        df.sample(n=max_words, random_state=seed, ignore_index=True)
+        if do_shuffle_output
+        else df.head(max_words)
+    )
+    return df

src/ocr_and_translate_en2jp/ocr.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from pathlib import Path
+from typing import Final, Optional, Union
+from pdf2image import convert_from_path
+from PIL import Image
+from pytesseract import pytesseract
+pytesseract.tesseract_cmd = '/usr/bin/tesseract'
+PICTURE_EXTENSIONS: Final[list[str]] = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
+def load_file(file_path: Union[str, Path]) -> Optional[Union[Image.Image, list]]:
+    file_path = Path(file_path)
+    file_extension = file_path.suffix
+    if file_extension == '.pdf':
+        return convert_from_path(str(file_path))
+    elif file_extension in PICTURE_EXTENSIONS:
+        return [Image.open(str(file_path))]
+    return None
+def ocr_image(
+    file_path: Union[str, Path], is_return_list: Optional[bool] = False
+) -> Union[str, list]:
+    images = load_file(file_path)
+    if images is None:
+        return []
+    ocr_result = ''.join(
+        pytesseract.image_to_string(img, lang='eng+jpn') for img in images
+    )
+    return ocr_result.split() if is_return_list else ocr_result

src/ocr_and_translate_en2jp/translate.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from typing import Final, Optional
+import requests
+from bs4 import BeautifulSoup
+BASE_URL: Final[str] = 'https://ejje.weblio.jp/content/'
+WORD_TYPE_JAPANESE_HTML_CLASS: Final[str] = 'content-explanation'
+TRANSLATION_HTML_CLASS: Final[str] = 'content-explanation ej'
+def generate_soup_from_url(word: str) -> BeautifulSoup:
+    response = requests.get(BASE_URL + word)
+    return BeautifulSoup(response.text, 'html.parser')
+def get_text_from_class(soup: BeautifulSoup, class_name: str) -> Optional[str]:
+    try:
+        return soup.find(class_=class_name).get_text().strip()
+    except AttributeError:
+        return None
+def get_word_type_japanese(word: str) -> Optional[str]:
+    soup = generate_soup_from_url(word)
+    text = get_text_from_class(soup, WORD_TYPE_JAPANESE_HTML_CLASS)
+    return text.split(' ')[0] if text else None
+def get_translation(word: str) -> Optional[str]:
+    soup = generate_soup_from_url(word)
+    return get_text_from_class(soup, TRANSLATION_HTML_CLASS)
+def translate_english_to_japanese(word: str) -> dict[str, Optional[str]]:
+    japanese_word_type = get_word_type_japanese(word)
+    japanese_translation = get_translation(word)
+    return {
+        'word': word,
+        'japanese_word': japanese_translation,
+        'word_type': japanese_word_type,
+    }

tests/ocr_and_translate_en2jp/__init__.py ADDED Viewed

File without changes