Spaces:

flax-community
/

dalle-mini

Running

Pedro Cuenca committed on Aug 27, 2021

Commit

6047b49

1 Parent(s): ecf5f29

Notebooks that demonstrate streaming encoding

Using either Huggingface Datasets, or webdataset.

Note that parallel processing is not possible for Huggingface Datasets
in streaming mode. A local copy or the use of webdataset is preferred
for large streaming datasets.

Files changed (2) hide show

dev/encoding/vqgan-jax-encoding-streaming.ipynb +0 -0
dev/encoding/vqgan-jax-encoding-webdataset.ipynb +408 -0

dev/encoding/vqgan-jax-encoding-streaming.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

dev/encoding/vqgan-jax-encoding-webdataset.ipynb ADDED Viewed

	@@ -0,0 +1,408 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d0b72877",
+   "metadata": {},
+   "source": [
+    "# vqgan-jax-encoding-alamy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba7b31e6",
+   "metadata": {},
+   "source": [
+    "Encoding notebook for the Alamy dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3b59489e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "import torch\n",
+    "import torchvision.transforms as T\n",
+    "import torchvision.transforms.functional as TF\n",
+    "from torchvision.transforms import InterpolationMode\n",
+    "import math\n",
+    "\n",
+    "import webdataset as wds\n",
+    "\n",
+    "import jax\n",
+    "from jax import pmap"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7c4c1e6",
+   "metadata": {},
+   "source": [
+    "## Dataset and Parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13c6631b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shards = 'https://s3.us-west-1.wasabisys.com/doodlebot-wasabi/datasets/alamy/webdataset/alamy-{000..895}.tar'\n",
+    "\n",
+    "# Enable curl retries to try to work around temporary network / server errors.\n",
+    "# This shouldn't be necessary when using reliable servers.\n",
+    "shards = f'pipe:curl -s --retry 5 --retry-delay 5 -L {shards} || true'\n",
+    "\n",
+    "length = 44710810    # estimate\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Output directory for encoded files\n",
+    "encoded_output = Path.home()/'data'/'alamy'/'encoded'\n",
+    "\n",
+    "batch_size = 128           # Per device\n",
+    "num_workers = 8            # Using larger numbers seemed to be less reliable in this case."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3435fb85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bs = batch_size * jax.device_count()    # Use a smaller size for testing\n",
+    "batches = math.ceil(length / bs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "669b35df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def center_crop(image, max_size=256):\n",
+    "    \"\"\"Resize `image` with `TF.resize(image, max_size)` (Lanczos), then center-crop it to a square of side `max_size`.\"\"\"\n",
+    "    # Note: upscaling is allowed too. Small images should ideally be excluded (not done here).\n",
+    "    resized = TF.resize(image, max_size, interpolation=InterpolationMode.LANCZOS)\n",
+    "    return TF.center_crop(resized, output_size=[max_size, max_size])\n",
+    "\n",
+    "preprocess_image = T.Compose([\n",
+    "    center_crop,\n",
+    "    T.ToTensor(),\n",
+    "    lambda t: t.permute(1, 2, 0)   # Reorder, we need dimensions last\n",
+    "])\n",
+    "\n",
+    "# Is there a shortcut for this?\n",
+    "def extract_from_json(item):\n",
+    "    \"\"\"Copy the `caption` and `url` entries of `item['json']` to the top level of `item`.\n",
+    "\n",
+    "    The dict is modified in place and the same object is returned.\n",
+    "    \"\"\"\n",
+    "    metadata = item['json']\n",
+    "    for key in ('caption', 'url'):\n",
+    "        item[key] = metadata[key]\n",
+    "    return item"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "369d9719",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Log exceptions to a hardcoded file\n",
+    "def ignore_and_log(exn):\n",
+    "    \"\"\"Exception handler: append the text of `exn` as one line to `errors.txt` and return True.\"\"\"\n",
+    "    with open('errors.txt', mode='a') as log_file:\n",
+    "        log_file.write(f'{exn}\\n')\n",
+    "    # NOTE(review): presumably a truthy result tells webdataset to keep going - confirm against its handler docs.\n",
+    "    return True\n",
+    "\n",
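+    "# Alternatively, use a built-in handler such as `wds.ignore_and_continue` or `wds.warn_and_continue`.\n",
+    "# Only the last assignment below takes effect; comment out the one you do not want.\n",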
+    "exception_handler = ignore_and_log\n",
+    "exception_handler = wds.warn_and_continue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5149b6d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = wds.WebDataset(shards,\n",
+    "                         length=batches,                    # Hint so `len` is implemented\n",
+    "                         shardshuffle=False,                # Keep same order for encoded files for easier bookkeeping\n",
+    "                         handler=exception_handler,   # Ignore read errors instead of failing. See also: `warn_and_continue`\n",
+    ")\n",
+    "\n",
+    "dataset = (dataset           \n",
+    "      .decode('pil')                     # decode image with PIL\n",
+    "      .map(extract_from_json)\n",
+    "      .map_dict(jpg=preprocess_image, handler=exception_handler)\n",
+    "      .to_tuple('url', 'jpg', 'caption') # filter to keep only url (for reference), image, caption.\n",
+    "      .batched(bs))                      # better to batch in the dataset (but we could also do it in the dataloader) - this arg does not affect speed and we could remove it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8cac98cb",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 8min 26s, sys: 12.5 s, total: 8min 38s\n",
+      "Wall time: 14.4 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "urls, images, captions = next(iter(dataset))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "cd268fbf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1024, 256, 256, 3])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "images.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "44d50a51",
+   "metadata": {},
+   "source": [
+    "### Torch DataLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e2df5e13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dl = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=num_workers)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a354472b",
+   "metadata": {},
+   "source": [
+    "## VQGAN-JAX model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2fcf01d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vqgan_jax.modeling_flax_vqgan import VQModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9daa636d",
+   "metadata": {},
+   "source": [
+    "We'll use a VQGAN trained with Taming Transformers and converted to a JAX model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "47a8b818",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Working with z of shape (1, 256, 16, 16) = 65536 dimensions.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62ad01c3",
+   "metadata": {},
+   "source": [
+    "## Encoding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20357f74",
+   "metadata": {},
+   "source": [
+    "Encoding is really simple using `shard` to automatically distribute \"superbatches\" across devices, and `pmap`. This is all it takes to create our encoding function, which will be jitted on first use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "6686b004",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from flax.training.common_utils import shard\n",
+    "from functools import partial"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "322a4619",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@partial(jax.pmap, axis_name=\"batch\")\n",
+    "def encode(batch):\n",
+    "    \"\"\"Run `model.encode` on `batch` under `jax.pmap` and return only its second output (the indices).\"\"\"\n",
+    "    # Not sure if we should `replicate` params, does not seem to have any effect\n",
+    "    _ignored, indices = model.encode(batch)\n",
+    "    return indices"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14375a41",
+   "metadata": {},
+   "source": [
+    "### Encoding loop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ff6c10d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "\n",
+    "def encode_captioned_dataset(dataloader, output_dir, save_every=14):\n",
+    "    \"\"\"Encode every batch from `dataloader` and write the results to JSON Lines split files.\n",
+    "\n",
+    "    Args:\n",
+    "        dataloader: iterable yielding `(urls, images, captions)` batches; `images` must provide `.numpy()`.\n",
+    "        output_dir: `pathlib.Path` where the `split_*.jsonl` files are written (created if missing).\n",
+    "        save_every: number of batches stored in each split file before a new one is started.\n",
+    "    \"\"\"\n",
+    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "    # Saving strategy:\n",
+    "    # - Create a new file every so often to prevent excessive file seeking.\n",
+    "    # - Save each batch after processing.\n",
+    "    # - Keep the file open until we are done with it.\n",
+    "    file = None\n",
+    "    try:\n",
+    "        for n, (urls, images, captions) in enumerate(tqdm(dataloader)):\n",
+    "            if n % save_every == 0:\n",
+    "                if file is not None:\n",
+    "                    file.close()\n",
+    "                split_num = n // save_every\n",
+    "                # The split number is formatted as 5 hexadecimal digits (`05x`).\n",
+    "                file = open(output_dir/f'split_{split_num:05x}.jsonl', 'w')\n",
+    "\n",
+    "            images = shard(images.numpy().squeeze())\n",
+    "            encoded = encode(images)\n",
+    "            # Flatten the per-device leading axes so there is one row of indices per image.\n",
+    "            encoded = encoded.reshape(-1, encoded.shape[-1])\n",
+    "\n",
+    "            encoded_as_string = list(map(lambda item: np.array2string(item, separator=',', max_line_width=50000, formatter={'int':lambda x: str(x)}), encoded))\n",
+    "            batch_df = pd.DataFrame.from_dict({\"url\": urls, \"caption\": captions, \"encoding\": encoded_as_string})\n",
+    "            batch_df.to_json(file, orient='records', lines=True)\n",
+    "    finally:\n",
+    "        # Close the last split so buffered records are flushed, even if encoding fails or is interrupted.\n",
+    "        if file is not None:\n",
+    "            file.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09ff75a3",
+   "metadata": {},
+   "source": [
+    "Create a new file every 318 iterations. This should produce splits of ~500 MB each, when using a total batch size of 1024."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "96222bb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_every = 318"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7704863d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|█▌                                                           | 1085/43663 [31:58<20:43:42,  1.75s/it]"
+     ]
+    }
+   ],
+   "source": [
+    "encode_captioned_dataset(dl, encoded_output, save_every=save_every)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8953dd84",
+   "metadata": {},
+   "source": [
+    "----"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "db471c52d602b4f5f40ecaf278e88ccfef85c29d0a1a07185b0d51fc7acf4e26"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}