Commit ed699bc
Parent(s): ce8b4b9

Training in progress, step 1000

Files changed:
- .ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb +231 -69
- fine-tune-whisper-streaming.ipynb +11 -138
- pytorch_model.bin +1 -1
- runs/Dec14_14-23-12_132-145-140-45/events.out.tfevents.1671027857.132-145-140-45.618344.0 +2 -2
- runs/Dec14_18-54-17_132-145-140-45/1671044156.1678598/events.out.tfevents.1671044156.132-145-140-45.618344.3 +3 -0
- runs/Dec14_18-54-17_132-145-140-45/events.out.tfevents.1671044156.132-145-140-45.618344.2 +3 -0
- runs/Dec14_19-08-48_132-145-140-45/1671044964.476709/events.out.tfevents.1671044964.132-145-140-45.1598466.1 +3 -0
- runs/Dec14_19-08-48_132-145-140-45/events.out.tfevents.1671044964.132-145-140-45.1598466.0 +3 -0
- training_args.bin +1 -1
.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb
CHANGED
@@ -226,7 +226,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "Reading metadata...: 230467it [00:
+ "Reading metadata...: 230467it [00:05, 42062.14it/s]\n"
 ]
 },
 {
@@ -289,7 +289,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 8,
 "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
 "metadata": {},
 "outputs": [],
@@ -333,14 +333,113 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 9,
 "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
 "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9769d7a9ab1148b8af2bd69abf74d5d6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/185k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a2b4d68d48d8439096430441c976bd21",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/837 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ceae9b86f1674939b330c81cb34c625a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "715ade22144945178519b742a88828d7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/494k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "381fff2e1ffa4331923ca1b4b3dc965d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/52.7k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7cf108c742b8431187e1e3494610df3c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/2.11k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "31a51dd942054666b52dce912df102a3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/2.06k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
 "source": [
 "from transformers import WhisperProcessor\n",
 "\n",
- "processor = WhisperProcessor.from_pretrained(\"juancopi81/whisper-medium-es\", language=\"Spanish\", task=\"transcribe\")"
+ "processor = WhisperProcessor.from_pretrained(\"juancopi81/whisper-medium-es-common-fleurs\", language=\"Spanish\", task=\"transcribe\")"
 ]
 },
 {
@@ -361,7 +460,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 10,
 "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988",
 "metadata": {},
 "outputs": [
@@ -372,7 +471,7 @@
 " 'sentence': Value(dtype='string', id=None)}"
 ]
 },
- "execution_count":
+ "execution_count": 10,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -398,7 +497,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 11,
 "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39",
 "metadata": {},
 "outputs": [],
@@ -418,7 +517,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 12,
 "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
 "metadata": {},
 "outputs": [],
@@ -445,7 +544,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 13,
 "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
 "metadata": {},
 "outputs": [],
@@ -481,7 +580,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 14,
 "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
 "metadata": {},
 "outputs": [],
@@ -499,7 +598,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 15,
 "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
 "metadata": {},
 "outputs": [],
@@ -520,7 +619,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 16,
 "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
 "metadata": {},
 "outputs": [],
@@ -541,7 +640,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 17,
 "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
 "metadata": {},
 "outputs": [],
@@ -611,7 +710,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 18,
 "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
 "metadata": {},
 "outputs": [],
@@ -659,7 +758,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 19,
 "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
 "metadata": {},
 "outputs": [],
@@ -686,7 +785,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 20,
 "id": "b22b4011-f31f-4b57-b684-c52332f92890",
 "metadata": {},
 "outputs": [],
@@ -715,7 +814,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 21,
 "id": "a11d1bfc-9e28-460f-a287-72d8f7bc1acb",
 "metadata": {},
 "outputs": [],
@@ -765,14 +864,70 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 30,
 "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
 "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--juancopi81--whisper-medium-es-common-fleurs/snapshots/ceeaee568ae1c40f6c1eb6bb1de818ae909f60fd/config.json\n",
+ "Model config WhisperConfig {\n",
+ " \"_name_or_path\": \"juancopi81/whisper-medium-es\",\n",
+ " \"activation_dropout\": 0.0,\n",
+ " \"activation_function\": \"gelu\",\n",
+ " \"architectures\": [\n",
+ " \"WhisperForConditionalGeneration\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"begin_suppress_tokens\": [\n",
+ " 220,\n",
+ " 50257\n",
+ " ],\n",
+ " \"bos_token_id\": 50257,\n",
+ " \"d_model\": 1024,\n",
+ " \"decoder_attention_heads\": 16,\n",
+ " \"decoder_ffn_dim\": 4096,\n",
+ " \"decoder_layerdrop\": 0.0,\n",
+ " \"decoder_layers\": 24,\n",
+ " \"decoder_start_token_id\": 50258,\n",
+ " \"dropout\": 0.1,\n",
+ " \"encoder_attention_heads\": 16,\n",
+ " \"encoder_ffn_dim\": 4096,\n",
+ " \"encoder_layerdrop\": 0.0,\n",
+ " \"encoder_layers\": 24,\n",
+ " \"eos_token_id\": 50257,\n",
+ " \"forced_decoder_ids\": null,\n",
+ " \"init_std\": 0.02,\n",
+ " \"is_encoder_decoder\": true,\n",
+ " \"max_length\": 448,\n",
+ " \"max_source_positions\": 1500,\n",
+ " \"max_target_positions\": 448,\n",
+ " \"model_type\": \"whisper\",\n",
+ " \"num_hidden_layers\": 24,\n",
+ " \"num_mel_bins\": 80,\n",
+ " \"pad_token_id\": 50257,\n",
+ " \"scale_embedding\": false,\n",
+ " \"suppress_tokens\": [],\n",
+ " \"torch_dtype\": \"float32\",\n",
+ " \"transformers_version\": \"4.26.0.dev0\",\n",
+ " \"use_cache\": false,\n",
+ " \"vocab_size\": 51865\n",
+ "}\n",
+ "\n",
+ "loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--juancopi81--whisper-medium-es-common-fleurs/snapshots/ceeaee568ae1c40f6c1eb6bb1de818ae909f60fd/pytorch_model.bin\n",
+ "All model checkpoint weights were used when initializing WhisperForConditionalGeneration.\n",
+ "\n",
+ "All the weights of WhisperForConditionalGeneration were initialized from the model checkpoint at juancopi81/whisper-medium-es-common-fleurs.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use WhisperForConditionalGeneration for predictions without further training.\n"
+ ]
+ }
+ ],
 "source": [
 "from transformers import WhisperForConditionalGeneration\n",
 "\n",
- "model = WhisperForConditionalGeneration.from_pretrained(\"juancopi81/whisper-medium-es\")"
+ "model = WhisperForConditionalGeneration.from_pretrained(\"juancopi81/whisper-medium-es-common-fleurs\")"
 ]
 },
 {
@@ -785,7 +940,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 31,
 "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
 "metadata": {},
 "outputs": [],
@@ -814,10 +969,18 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 32,
 "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
 "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "PyTorch: setting up devices\n"
+ ]
+ }
+ ],
 "source": [
 "from transformers import Seq2SeqTrainingArguments\n",
 "\n",
@@ -825,7 +988,7 @@
 " output_dir=\"./\",\n",
 " per_device_train_batch_size=32,\n",
 " gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size\n",
- " learning_rate=
+ " learning_rate=3e-6,\n",
 " warmup_steps=500,\n",
 " max_steps=5000,\n",
 " gradient_checkpointing=True,\n",
@@ -864,7 +1027,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 33,
 "id": "3ac16b62-b3c0-4c68-8f3d-9ecf471534b2",
 "metadata": {},
 "outputs": [],
@@ -893,7 +1056,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 34,
 "id": "d546d7fe-0543-479a-b708-2ebabec19493",
 "metadata": {},
 "outputs": [
@@ -901,7 +1064,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "/home/ubuntu/whisper-
+ "/home/ubuntu/whisper-medium-es-common-fleurs-5k-10k/./ is already a clone of https://huggingface.co/juancopi81/whisper-medium-es-common-fleurs-5k-10k. Make sure you pull the latest changes with `repo.git_pull()`.\n",
 "max_steps is given, it will override any value given in num_train_epochs\n",
 "Using cuda_amp half precision backend\n"
 ]
@@ -932,7 +1095,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 35,
 "id": "a1ccb9ed-cbc8-4419-91c0-651e9424b672",
 "metadata": {},
 "outputs": [
@@ -978,14 +1141,14 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 28,
 "id": "ced90915-84df-4538-9034-f6c8c85de2df",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "
+ "model_id": "386d02833fb0467980c51f82505ce44a",
 "version_major": 2,
 "version_minor": 0
 },
@@ -1005,7 +1168,7 @@
 },
 {
 "cell_type": "code",
- "execution_count":
+ "execution_count": 29,
 "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
 "metadata": {},
 "outputs": [
@@ -1023,7 +1186,7 @@
 " Gradient Accumulation steps = 1\n",
 " Total optimization steps = 5000\n",
 " Number of trainable parameters = 763857920\n",
- "Reading metadata...: 230467it [00:
+ "Reading metadata...: 230467it [00:05, 39424.34it/s]\n",
 "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n"
 ]
 },
@@ -1033,8 +1196,8 @@
 "\n",
 " <div>\n",
 " \n",
- " <progress value='
- " [
+ " <progress value='1038' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [1038/5000 4:24:08 < 16:50:09, 0.07 it/s, Epoch 0.21/9223372036854775807]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -1048,15 +1211,9 @@
 " <tbody>\n",
 " <tr>\n",
 " <td>1000</td>\n",
- " <td>0.
- " <td>0.
- " <td>
- " </tr>\n",
- " <tr>\n",
- " <td>2000</td>\n",
- " <td>0.033600</td>\n",
- " <td>0.209724</td>\n",
- " <td>6.755756</td>\n",
+ " <td>0.096600</td>\n",
+ " <td>0.234865</td>\n",
+ " <td>7.640585</td>\n",
 " </tr>\n",
 " </tbody>\n",
 "</table><p>"
@@ -1075,8 +1232,8 @@
 "***** Running Evaluation *****\n",
 " Num examples: Unknown\n",
 " Batch size = 16\n",
- "Reading metadata...: 15520it [00:00,
- "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored:
+ "Reading metadata...: 15520it [00:00, 83747.62it/s]\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: up_votes, client_id, down_votes, gender, accent, segment, path, locale, input_length, age. If up_votes, client_id, down_votes, gender, accent, segment, path, locale, input_length, age are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n",
 "Saving model checkpoint to ./checkpoint-1000\n",
 "Configuration saved in ./checkpoint-1000/config.json\n",
 "Model weights saved in ./checkpoint-1000/pytorch_model.bin\n",
@@ -1087,28 +1244,33 @@
 "Feature extractor saved in ./preprocessor_config.json\n",
 "tokenizer config file saved in ./tokenizer_config.json\n",
 "Special tokens file saved in ./special_tokens_map.json\n",
- "added tokens file saved in ./added_tokens.json\n"
-
-
-
-
-
-
-
- "
- "
- "
- "
- "
- "
- "
- "
- "
- "
- "
- "
- "
- "
+ "added tokens file saved in ./added_tokens.json\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "---------------------------------------------------------------------------",
+ "KeyboardInterrupt Traceback (most recent call last)",
+ "Cell In[29], line 1\n----> 1 trainer.train()\n",
+ "File ~/hf_env/lib/python3.8/site-packages/transformers/trainer.py:1534, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\n 1529 self.model_wrapped = self.model\n 1531 inner_training_loop = find_executable_batch_size(\n 1532 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size\n 1533 )\n-> 1534 return inner_training_loop(\n 1535 args=args,\n 1536 resume_from_checkpoint=resume_from_checkpoint,\n 1537 trial=trial,\n 1538 ignore_keys_for_eval=ignore_keys_for_eval,\n 1539 )\n",
+ "File ~/hf_env/lib/python3.8/site-packages/transformers/trainer.py:1756, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\n 1753 self._load_rng_state(resume_from_checkpoint)\n 1755 step = -1\n-> 1756 for step, inputs in enumerate(epoch_iterator):\n 1757 \n 1758 # Skip past any already trained steps if resuming training\n 1759 if steps_trained_in_current_epoch > 0:\n 1760 steps_trained_in_current_epoch -= 1\n",
+ "File ~/hf_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py:628, in _BaseDataLoaderIter.__next__(self)\n 625 if self._sampler_iter is None:\n 626 # TODO(https://github.com/pytorch/pytorch/issues/76750)\n 627 self._reset() # type: ignore[call-arg]\n--> 628 data = self._next_data()\n 629 self._num_yielded += 1\n 630 if self._dataset_kind == _DatasetKind.Iterable and \\\n 631 self._IterableDataset_len_called is not None and \\\n 632 self._num_yielded > self._IterableDataset_len_called:\n",
+ "File ~/hf_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py:671, in _SingleProcessDataLoaderIter._next_data(self)\n 669 def _next_data(self):\n 670 index = self._next_index() # may raise StopIteration\n--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration\n 672 if self._pin_memory:\n 673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)\n",
+ "File ~/hf_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py:34, in _IterableDatasetFetcher.fetch(self, possibly_batched_index)\n 32 for _ in possibly_batched_index:\n 33 try:\n---> 34 data.append(next(self.dataset_iter))\n 35 except StopIteration:\n 36 self.ended = True\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/formatting/dataset_wrappers/torch_iterable_dataset.py:35, in TorchIterableDataset.__iter__(self)\n 33 worker_info = torch.utils.data.get_worker_info()\n 34 if worker_info is None: # single-process data loading, return the full iterator\n---> 35 yield from IterableDataset.__iter__(self)\n 36 else: # in a worker process\n 37 # check if there aren't too many workers\n 38 if worker_info.id == 0 and self.n_shards < worker_info.num_workers:\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/iterable_dataset.py:758, in IterableDataset.__iter__(self)\n 757 def __iter__(self):\n--> 758 for key, example in self._iter():\n 759 if self.features:\n 760 # `IterableDataset` automatically fills missing columns with None.\n 761 # This is done with `_apply_feature_types`.\n 762 yield _apply_feature_types(example, self.features, token_per_repo_id=self._token_per_repo_id)\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/iterable_dataset.py:748, in IterableDataset._iter(self)\n 746 else:\n 747 ex_iterable = self._ex_iterable\n--> 748 yield from ex_iterable\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/iterable_dataset.py:515, in FilteredExamplesIterable.__iter__(self)\n 513 current_idx += batch_idx + 1\n 514 else:\n--> 515 for key, example in iterator:\n 516 # If not batched, we can apply the filtering function direcly\n 517 inputs = dict(example)\n 518 function_args = [inputs] if self.input_columns is None else [inputs[col] for col in self.input_columns]\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/iterable_dataset.py:570, in BufferShuffledExamplesIterable.__iter__(self)\n 568 # this is the shuffle buffer that we keep in memory\n 569 mem_buffer = []\n--> 570 for x in self.ex_iterable:\n 571 if len(mem_buffer) == buffer_size: # if the buffer is full, pick and example from it\n 572 i = next(indices_iterator)\n",
+ "File ~/hf_env/lib/python3.8/site-packages/datasets/iterable_dataset.py:433, in MappedExamplesIterable.__iter__(self)\n 431 function_args.append(current_idx)\n 432 transformed_example = dict(example) # this will be updated with the function output\n--> 433 transformed_example.update(self.function(*function_args, **self.fn_kwargs))\n 434 # then we remove the unwanted columns\n 435 if self.remove_columns:\n",
+ "Cell In[13], line 6, in prepare_dataset(batch)\n 3 audio = batch[\"audio\"]\n 5 # compute log-Mel input features from input audio array \n----> 6 batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n 7 # compute input length of audio sample in seconds\n 8 batch[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
+ "File ~/hf_env/lib/python3.8/site-packages/transformers/models/whisper/feature_extraction_whisper.py:314, in WhisperFeatureExtractor.__call__(self, raw_speech, truncation, pad_to_multiple_of, return_tensors, return_attention_mask, padding, max_length, sampling_rate, **kwargs)\n 311 # make sure list is in array format\n 312 input_features = padded_inputs.get(\"input_features\").transpose(2, 0, 1)\n--> 314 input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]]\n 316 if isinstance(input_features[0], List):\n 317 padded_inputs[\"input_features\"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]\n",
+ "File ~/hf_env/lib/python3.8/site-packages/transformers/models/whisper/feature_extraction_whisper.py:314, in <listcomp>(.0)\n 311 # make sure list is in array format\n 312 input_features = padded_inputs.get(\"input_features\").transpose(2, 0, 1)\n--> 314 input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]]\n 316 if isinstance(input_features[0], List):\n 317 padded_inputs[\"input_features\"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]\n",
+ "File ~/hf_env/lib/python3.8/site-packages/transformers/models/whisper/feature_extraction_whisper.py:207, in WhisperFeatureExtractor._np_extract_fbank_features(self, waveform)\n 205 frames = self.fram_wave(waveform)\n 206 stft = self.stft(frames, window=window)\n--> 207 magnitudes = np.abs(stft[:, :-1]) ** 2\n 209 filters = self.mel_filters\n 210 mel_spec = filters @ magnitudes\n",
+ "KeyboardInterrupt: "
 ]
 }
 ],
@@ -1147,7 +1309,7 @@
 " \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
 " \"dataset\": \"Common Voice 11.0\", # a 'pretty' name for the training dataset\n",
 " \"language\": \"es\",\n",
- " \"model_name\": \"Whisper
+ " \"model_name\": \"Whisper Mediuem Es - Juan Pineros\", # a 'pretty' name for your model\n",
 " \"finetuned_from\": \"openai/whisper-small\",\n",
 " \"tasks\": \"automatic-speech-recognition\",\n",
 " \"tags\": \"whisper-event\",\n",
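Read as plain Python rather than escaped notebook JSON, the changed cells above boil down to the following sketch (only values visible in the hunks are filled in; the repo name suggests a Common Voice + FLEURS fine-tune, though the diff itself does not say so):

    from transformers import (
        Seq2SeqTrainingArguments,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )

    # The source checkpoint moves from "juancopi81/whisper-medium-es" to
    # "juancopi81/whisper-medium-es-common-fleurs".
    processor = WhisperProcessor.from_pretrained(
        "juancopi81/whisper-medium-es-common-fleurs",
        language="Spanish",
        task="transcribe",
    )
    model = WhisperForConditionalGeneration.from_pretrained(
        "juancopi81/whisper-medium-es-common-fleurs"
    )

    # The learning rate drops to 3e-6; the surrounding arguments are unchanged
    # context lines, and arguments outside the hunk are omitted here.
    training_args = Seq2SeqTrainingArguments(
        output_dir="./",
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
        learning_rate=3e-6,
        warmup_steps=500,
        max_steps=5000,
        gradient_checkpointing=True,
    )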
fine-tune-whisper-streaming.ipynb
CHANGED
@@ -226,7 +226,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "Reading metadata...: 230467it [00:05,
+ "Reading metadata...: 230467it [00:05, 45342.52it/s]\n"
 ]
 },
 {
@@ -336,106 +336,7 @@
 "execution_count": 9,
 "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
 "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "9769d7a9ab1148b8af2bd69abf74d5d6",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/185k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "a2b4d68d48d8439096430441c976bd21",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/837 [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "ceae9b86f1674939b330c81cb34c625a",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "715ade22144945178519b742a88828d7",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/494k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "381fff2e1ffa4331923ca1b4b3dc965d",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/52.7k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "7cf108c742b8431187e1e3494610df3c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/2.11k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "31a51dd942054666b52dce912df102a3",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/2.06k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
 "source": [
 "from transformers import WhisperProcessor\n",
 "\n",
@@ -867,36 +768,7 @@
 "execution_count": 22,
 "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
 "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "3b21514a2fff4878a2f569d2cc28b925",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/1.04k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "28d70b74dbd844328ad9d325c9babfe1",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading: 0%| | 0.00/3.06G [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
 "source": [
 "from transformers import WhisperForConditionalGeneration\n",
 "\n",
@@ -920,7 +792,8 @@
 "source": [
 "model.config.forced_decoder_ids = None\n",
 "model.config.suppress_tokens = []\n",
- "model.config.use_cache = False"
+ "model.config.use_cache = False\n",
+ "model.config.dropout = 0.1"
 ]
 },
 {
@@ -952,7 +825,7 @@
 " output_dir=\"./\",\n",
 " per_device_train_batch_size=32,\n",
 " gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size\n",
- " learning_rate=
+ " learning_rate=3e-6,\n",
 " warmup_steps=500,\n",
 " max_steps=5000,\n",
 " gradient_checkpointing=True,\n",
@@ -1112,7 +985,7 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "
+ "model_id": "dca83cda148e49d9ba1b129e3b58fc2f",
 "version_major": 2,
 "version_minor": 0
 },
@@ -1150,7 +1023,7 @@
 " Gradient Accumulation steps = 1\n",
 " Total optimization steps = 5000\n",
 " Number of trainable parameters = 763857920\n",
- "Reading metadata...: 230467it [00:
+ "Reading metadata...: 230467it [00:02, 96908.84it/s] \n",
 "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n"
 ]
 },
@@ -1161,7 +1034,7 @@
 " <div>\n",
 " \n",
 " <progress value='1001' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
- " [1001/5000 1:45:
+ " [1001/5000 1:45:29 < 7:02:15, 0.16 it/s, Epoch 0.20/9223372036854775807]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -1189,8 +1062,8 @@
 "***** Running Evaluation *****\n",
 " Num examples: Unknown\n",
 " Batch size = 16\n",
- "Reading metadata...: 15520it [00:00,
- "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored:
+ "Reading metadata...: 15520it [00:00, 92814.18it/s]\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: accent, up_votes, locale, age, input_length, path, client_id, segment, gender, down_votes. If accent, up_votes, locale, age, input_length, path, client_id, segment, gender, down_votes are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n"
 ]
 }
 ],
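The live notebook gets the same checkpoint and learning-rate changes, plus one genuinely new line in the model-configuration cell. As plain Python (a sketch; the use_cache comment states the usual reason rather than anything recorded in the diff):

    # model is the WhisperForConditionalGeneration loaded above
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False  # caching is typically disabled when gradient_checkpointing=True
    model.config.dropout = 0.1      # new in this commit; matches "dropout": 0.1 in the loaded WhisperConfig

Taken together, the lower learning rate and the added dropout read as a regularization pass for continued fine-tuning from an already fine-tuned checkpoint.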
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:
+ oid sha256:a05aa841841d192dfcf9039758a9124e7e20b2ab8da5125aa82332fa0c718563
 size 3055754841
runs/Dec14_14-23-12_132-145-140-45/events.out.tfevents.1671027857.132-145-140-45.618344.0
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:
- size
+ oid sha256:6b3b7c997ea40e8c65277496ae69ac65cd1c79706f4a1c18a8f918fee2054fa7
+ size 11051
runs/Dec14_18-54-17_132-145-140-45/1671044156.1678598/events.out.tfevents.1671044156.132-145-140-45.618344.3
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b96cb454b363871ef431445ee594bfa3a4fa9edb374fc5234bb77e2b891d7a8c
+ size 5864
runs/Dec14_18-54-17_132-145-140-45/events.out.tfevents.1671044156.132-145-140-45.618344.2
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62ef8b9cb3665f837bea393f7fcc3a3b0684bd35babe5d5ff95e9699a630214c
+ size 4311
runs/Dec14_19-08-48_132-145-140-45/1671044964.476709/events.out.tfevents.1671044964.132-145-140-45.1598466.1
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:009cd0e80b8ba9c5683708fa63d4b2b983bbfb624c96ef59a3b566b435a7c7da
+ size 5864
runs/Dec14_19-08-48_132-145-140-45/events.out.tfevents.1671044964.132-145-140-45.1598466.0
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65ca4c46fb88d750e940405304e4a532d88a53b1800c33ac9058e5f787380b8f
+ size 10894
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:
+ oid sha256:87c7fbec30a355ab216bff56aaaf037f4afe0c75cf7ed4d9ff39c96f4dbfee91
 size 3579