jimregan committed on
Commit
6834949
·
1 Parent(s): 105571d

add notebooks from Kaggle

Browse files
kaggle-notebooks/cmu-us-awb-arctic-fairseq-files.ipynb ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "bddfd111",
7
+ "metadata": {
8
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10
+ "execution": {
11
+ "iopub.execute_input": "2022-05-07T08:23:49.094601Z",
12
+ "iopub.status.busy": "2022-05-07T08:23:49.094098Z",
13
+ "iopub.status.idle": "2022-05-07T08:23:49.103907Z",
14
+ "shell.execute_reply": "2022-05-07T08:23:49.103002Z"
15
+ },
16
+ "papermill": {
17
+ "duration": 0.026388,
18
+ "end_time": "2022-05-07T08:23:49.105961",
19
+ "exception": false,
20
+ "start_time": "2022-05-07T08:23:49.079573",
21
+ "status": "completed"
22
+ },
23
+ "tags": []
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "RAWTEXT = \"../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/etc/txt.done.data\""
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "id": "146a024b",
34
+ "metadata": {
35
+ "execution": {
36
+ "iopub.execute_input": "2022-05-07T08:23:49.130710Z",
37
+ "iopub.status.busy": "2022-05-07T08:23:49.130417Z",
38
+ "iopub.status.idle": "2022-05-07T08:23:49.135862Z",
39
+ "shell.execute_reply": "2022-05-07T08:23:49.134985Z"
40
+ },
41
+ "papermill": {
42
+ "duration": 0.020182,
43
+ "end_time": "2022-05-07T08:23:49.138041",
44
+ "exception": false,
45
+ "start_time": "2022-05-07T08:23:49.117859",
46
+ "status": "completed"
47
+ },
48
+ "tags": []
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "NORMS = {\n",
53
+ " \"0.75\": \"zero point seven five\",\n",
54
+ " \"t.h\": \"t h\",\n",
55
+ " \"1880\": \"eighteen eighty\",\n",
56
+ " \"16\": \"sixteenth\",\n",
57
+ " \"1908\": \"nineteen oh eight\",\n",
58
+ " \"18\": \"eighteenth\",\n",
59
+ " \"17\": \"seventeenth\",\n",
60
+ " \"29th\": \"twenty ninth\",\n",
61
+ " \"mrs\": \"misses\",\n",
62
+ " \"etc\": \"etcetera\",\n",
63
+ " \"etc.\": \"etcetera\",\n",
64
+ " \"to-day\": \"today\",\n",
65
+ " \"to-day's\": \"today's\",\n",
66
+ " \"to-morrow\": \"tomorrow\"\n",
67
+ "}"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 3,
73
+ "id": "46c0c682",
74
+ "metadata": {
75
+ "execution": {
76
+ "iopub.execute_input": "2022-05-07T08:23:49.163049Z",
77
+ "iopub.status.busy": "2022-05-07T08:23:49.162461Z",
78
+ "iopub.status.idle": "2022-05-07T08:23:49.170619Z",
79
+ "shell.execute_reply": "2022-05-07T08:23:49.169730Z"
80
+ },
81
+ "papermill": {
82
+ "duration": 0.022915,
83
+ "end_time": "2022-05-07T08:23:49.172592",
84
+ "exception": false,
85
+ "start_time": "2022-05-07T08:23:49.149677",
86
+ "status": "completed"
87
+ },
88
+ "tags": []
89
+ },
90
+ "outputs": [],
91
+ "source": [
92
+ "def _check_apos(word):\n",
93
+ " if word.endswith(\"'s\"):\n",
94
+ " return word\n",
95
+ " elif word.endswith(\"s'\"):\n",
96
+ " return word\n",
97
+ " elif word.endswith(\"'d\"):\n",
98
+ " return word\n",
99
+ " elif word.endswith(\"'ve\"):\n",
100
+ " return word\n",
101
+ " elif word.endswith(\"'re\"):\n",
102
+ " return word\n",
103
+ " elif word.endswith(\"'ll\"):\n",
104
+ " return word\n",
105
+ " elif word.endswith(\"n't\"):\n",
106
+ " return word\n",
107
+ " elif word.endswith(\"'ve\"):\n",
108
+ " return word\n",
109
+ " elif word in [\"i'm\", \"'em\", \"o'brien\"]:\n",
110
+ " return word\n",
111
+ " else:\n",
112
+ " return word.replace(\"'\", \"\")\n",
113
+ "\n",
114
+ "def fix_apos(text):\n",
115
+ " words = [_check_apos(w) for w in text.split(\" \")]\n",
116
+ " return \" \".join(words)"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 4,
122
+ "id": "ad9ec20c",
123
+ "metadata": {
124
+ "execution": {
125
+ "iopub.execute_input": "2022-05-07T08:23:49.197464Z",
126
+ "iopub.status.busy": "2022-05-07T08:23:49.197091Z",
127
+ "iopub.status.idle": "2022-05-07T08:23:49.206427Z",
128
+ "shell.execute_reply": "2022-05-07T08:23:49.205485Z"
129
+ },
130
+ "papermill": {
131
+ "duration": 0.024203,
132
+ "end_time": "2022-05-07T08:23:49.208449",
133
+ "exception": false,
134
+ "start_time": "2022-05-07T08:23:49.184246",
135
+ "status": "completed"
136
+ },
137
+ "tags": []
138
+ },
139
+ "outputs": [],
140
+ "source": [
141
+ "def normalise(text):\n",
142
+ " if text[-1] == \".\":\n",
143
+ " text = text[:-1]\n",
144
+ " text = text.lower()\n",
145
+ " words = []\n",
146
+ " text = text.replace(\",\", \"\")\n",
147
+ " for word in text.split(\" \"):\n",
148
+ " if word in NORMS:\n",
149
+ " words.append(NORMS[word])\n",
150
+ " else:\n",
151
+ " words.append(word)\n",
152
+ " text = \" \".join(words)\n",
153
+ " text = text.replace(\".\", \"\")\n",
154
+ " text = text.replace(\"?\", \"\")\n",
155
+ " text = text.replace(\"!\", \"\")\n",
156
+ " text = text.replace(\":\", \"\")\n",
157
+ " text = text.replace(\";\", \"\")\n",
158
+ " text = text.replace(\"--\", \" \")\n",
159
+ " text = text.replace(\"  \", \" \")\n",
160
+ " text = text.replace(\" - \", \" \")\n",
161
+ " text = text.replace(\"to- morrow\", \"tomorrow\")\n",
162
+ " text = fix_apos(text)\n",
163
+ " text = text.replace(\"-\", \" \")\n",
164
+ " return text.strip().upper()"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 5,
170
+ "id": "aec22c8a",
171
+ "metadata": {
172
+ "execution": {
173
+ "iopub.execute_input": "2022-05-07T08:23:49.233246Z",
174
+ "iopub.status.busy": "2022-05-07T08:23:49.232952Z",
175
+ "iopub.status.idle": "2022-05-07T08:23:49.275141Z",
176
+ "shell.execute_reply": "2022-05-07T08:23:49.274337Z"
177
+ },
178
+ "papermill": {
179
+ "duration": 0.057432,
180
+ "end_time": "2022-05-07T08:23:49.277432",
181
+ "exception": false,
182
+ "start_time": "2022-05-07T08:23:49.220000",
183
+ "status": "completed"
184
+ },
185
+ "tags": []
186
+ },
187
+ "outputs": [],
188
+ "source": [
189
+ "data = {}\n",
190
+ "with open(RAWTEXT) as inf:\n",
191
+ " for line in inf.readlines():\n",
192
+ " first_space = line.find(' ')\n",
193
+ " first_quote = line.find('\"')\n",
194
+ " last_quote = line.rfind('\"')\n",
195
+ " id = line[first_space+1:first_quote].strip()\n",
196
+ " text = line[first_quote+1:last_quote]\n",
197
+ " data[id] = normalise(text)"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 6,
203
+ "id": "c4d12ef6",
204
+ "metadata": {
205
+ "execution": {
206
+ "iopub.execute_input": "2022-05-07T08:23:49.302618Z",
207
+ "iopub.status.busy": "2022-05-07T08:23:49.302155Z",
208
+ "iopub.status.idle": "2022-05-07T08:23:49.306841Z",
209
+ "shell.execute_reply": "2022-05-07T08:23:49.306240Z"
210
+ },
211
+ "papermill": {
212
+ "duration": 0.019827,
213
+ "end_time": "2022-05-07T08:23:49.308829",
214
+ "exception": false,
215
+ "start_time": "2022-05-07T08:23:49.289002",
216
+ "status": "completed"
217
+ },
218
+ "tags": []
219
+ },
220
+ "outputs": [],
221
+ "source": [
222
+ "with open(\"text.tsv\", \"w\") as of:\n",
223
+ " for id in data.keys():\n",
224
+ " of.write(f\"{id}\\t{data[id]}\\n\")"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 7,
230
+ "id": "62dc9132",
231
+ "metadata": {
232
+ "execution": {
233
+ "iopub.execute_input": "2022-05-07T08:23:49.333170Z",
234
+ "iopub.status.busy": "2022-05-07T08:23:49.332878Z",
235
+ "iopub.status.idle": "2022-05-07T08:23:58.857125Z",
236
+ "shell.execute_reply": "2022-05-07T08:23:58.855696Z"
237
+ },
238
+ "papermill": {
239
+ "duration": 9.539562,
240
+ "end_time": "2022-05-07T08:23:58.859914",
241
+ "exception": false,
242
+ "start_time": "2022-05-07T08:23:49.320352",
243
+ "status": "completed"
244
+ },
245
+ "tags": []
246
+ },
247
+ "outputs": [
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "Total: 4777.0\n"
253
+ ]
254
+ }
255
+ ],
256
+ "source": [
257
+ "from pathlib import Path\n",
258
+ "import soundfile as sf\n",
259
+ "\n",
260
+ "total = 0\n",
261
+ "WAVPATH = Path(\"../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/wav/\")\n",
262
+ "with open(\"frames.tsv\", \"w\") as of:\n",
263
+ " for wav in sorted(WAVPATH.glob(\"*.wav\")):\n",
264
+ " frames, sr = sf.read(str(wav))\n",
265
+ " assert sr == 16000\n",
266
+ " total += len(frames)\n",
267
+ " of.write(f\"{wav.stem}.wav\\t{len(frames)}\\n\")\n",
268
+ "print(\"Total:\", total / 16000)"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 8,
274
+ "id": "71a8a65f",
275
+ "metadata": {
276
+ "execution": {
277
+ "iopub.execute_input": "2022-05-07T08:23:58.896455Z",
278
+ "iopub.status.busy": "2022-05-07T08:23:58.895715Z",
279
+ "iopub.status.idle": "2022-05-07T08:24:01.181186Z",
280
+ "shell.execute_reply": "2022-05-07T08:24:01.179981Z"
281
+ },
282
+ "papermill": {
283
+ "duration": 2.308171,
284
+ "end_time": "2022-05-07T08:24:01.183774",
285
+ "exception": false,
286
+ "start_time": "2022-05-07T08:23:58.875603",
287
+ "status": "completed"
288
+ },
289
+ "tags": []
290
+ },
291
+ "outputs": [],
292
+ "source": [
293
+ "lines=!wc -l frames.tsv|awk '{print $1}'\n",
294
+ "!tail -n 114 frames.tsv |head -n 57 > test.tsv\n",
295
+ "!tail -n 114 frames.tsv |tail -n 57 > dev.tsv\n",
296
+ "!head -n $((1138-114)) frames.tsv > train.tsv"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 9,
302
+ "id": "6c7413f1",
303
+ "metadata": {
304
+ "execution": {
305
+ "iopub.execute_input": "2022-05-07T08:24:01.210537Z",
306
+ "iopub.status.busy": "2022-05-07T08:24:01.209625Z",
307
+ "iopub.status.idle": "2022-05-07T08:24:01.215252Z",
308
+ "shell.execute_reply": "2022-05-07T08:24:01.214668Z"
309
+ },
310
+ "papermill": {
311
+ "duration": 0.021169,
312
+ "end_time": "2022-05-07T08:24:01.217174",
313
+ "exception": false,
314
+ "start_time": "2022-05-07T08:24:01.196005",
315
+ "status": "completed"
316
+ },
317
+ "tags": []
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "def do_fairseq(text):\n",
322
+ " words = text.split(\" \")\n",
323
+ " owords = [\" \".join(w) for w in words]\n",
324
+ " return \" | \".join(owords) + \" |\""
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 10,
330
+ "id": "9342e47e",
331
+ "metadata": {
332
+ "execution": {
333
+ "iopub.execute_input": "2022-05-07T08:24:01.242716Z",
334
+ "iopub.status.busy": "2022-05-07T08:24:01.242299Z",
335
+ "iopub.status.idle": "2022-05-07T08:24:01.257797Z",
336
+ "shell.execute_reply": "2022-05-07T08:24:01.257054Z"
337
+ },
338
+ "papermill": {
339
+ "duration": 0.031049,
340
+ "end_time": "2022-05-07T08:24:01.260092",
341
+ "exception": false,
342
+ "start_time": "2022-05-07T08:24:01.229043",
343
+ "status": "completed"
344
+ },
345
+ "tags": []
346
+ },
347
+ "outputs": [],
348
+ "source": [
349
+ "for part in [\"test\", \"train\", \"dev\"]:\n",
350
+ " ids = []\n",
351
+ " with open(f\"{part}.ltr\", \"w\") as of, open(f\"{part}.tsv\") as inf:\n",
352
+ " for line in inf.readlines():\n",
353
+ " if \"\\t\" in line:\n",
354
+ " parts = line.strip().split(\"\\t\")\n",
355
+ " id = parts[0].replace(\".wav\", \"\")\n",
356
+ " of.write(do_fairseq(data[id]) + \"\\n\")"
357
+ ]
358
+ }
359
+ ],
360
+ "metadata": {
361
+ "kernelspec": {
362
+ "display_name": "Python 3",
363
+ "language": "python",
364
+ "name": "python3"
365
+ },
366
+ "language_info": {
367
+ "codemirror_mode": {
368
+ "name": "ipython",
369
+ "version": 3
370
+ },
371
+ "file_extension": ".py",
372
+ "mimetype": "text/x-python",
373
+ "name": "python",
374
+ "nbconvert_exporter": "python",
375
+ "pygments_lexer": "ipython3",
376
+ "version": "3.7.12"
377
+ },
378
+ "papermill": {
379
+ "default_parameters": {},
380
+ "duration": 23.168986,
381
+ "end_time": "2022-05-07T08:24:01.994036",
382
+ "environment_variables": {},
383
+ "exception": null,
384
+ "input_path": "__notebook__.ipynb",
385
+ "output_path": "__notebook__.ipynb",
386
+ "parameters": {},
387
+ "start_time": "2022-05-07T08:23:38.825050",
388
+ "version": "2.3.4"
389
+ }
390
+ },
391
+ "nbformat": 4,
392
+ "nbformat_minor": 5
393
+ }
kaggle-notebooks/create-awb-splits.ipynb ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d250a6ca",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.015615,
9
+ "end_time": "2022-05-07T20:21:51.320467",
10
+ "exception": false,
11
+ "start_time": "2022-05-07T20:21:51.304852",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# Load frame lengths"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 1,
23
+ "id": "c67326d6",
24
+ "metadata": {
25
+ "execution": {
26
+ "iopub.execute_input": "2022-05-07T20:21:51.351926Z",
27
+ "iopub.status.busy": "2022-05-07T20:21:51.351146Z",
28
+ "iopub.status.idle": "2022-05-07T20:21:51.375423Z",
29
+ "shell.execute_reply": "2022-05-07T20:21:51.374683Z"
30
+ },
31
+ "papermill": {
32
+ "duration": 0.042741,
33
+ "end_time": "2022-05-07T20:21:51.377820",
34
+ "exception": false,
35
+ "start_time": "2022-05-07T20:21:51.335079",
36
+ "status": "completed"
37
+ },
38
+ "tags": []
39
+ },
40
+ "outputs": [],
41
+ "source": [
42
+ "train_frames = {}\n",
43
+ "\n",
44
+ "total = 0\n",
45
+ "with open(\"../input/cmu-us-awb-arctic-fairseq-files/train.tsv\") as f:\n",
46
+ " for line in f.readlines():\n",
47
+ " if not \"\\t\" in line:\n",
48
+ " continue\n",
49
+ " pieces = line.strip().split(\"\\t\")\n",
50
+ " total += int(pieces[1])\n",
51
+ " assert len(pieces) == 2\n",
52
+ " id = pieces[0].replace(\".wav\", \"\")\n",
53
+ " train_frames[id] = int(pieces[1])"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 2,
59
+ "id": "b7994029",
60
+ "metadata": {
61
+ "execution": {
62
+ "iopub.execute_input": "2022-05-07T20:21:51.408607Z",
63
+ "iopub.status.busy": "2022-05-07T20:21:51.408333Z",
64
+ "iopub.status.idle": "2022-05-07T20:21:51.415606Z",
65
+ "shell.execute_reply": "2022-05-07T20:21:51.414918Z"
66
+ },
67
+ "papermill": {
68
+ "duration": 0.025832,
69
+ "end_time": "2022-05-07T20:21:51.418472",
70
+ "exception": false,
71
+ "start_time": "2022-05-07T20:21:51.392640",
72
+ "status": "completed"
73
+ },
74
+ "tags": []
75
+ },
76
+ "outputs": [
77
+ {
78
+ "data": {
79
+ "text/plain": [
80
+ "4295.0"
81
+ ]
82
+ },
83
+ "execution_count": 2,
84
+ "metadata": {},
85
+ "output_type": "execute_result"
86
+ }
87
+ ],
88
+ "source": [
89
+ "total / 16000"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 3,
95
+ "id": "3f849d42",
96
+ "metadata": {
97
+ "execution": {
98
+ "iopub.execute_input": "2022-05-07T20:21:51.453347Z",
99
+ "iopub.status.busy": "2022-05-07T20:21:51.452190Z",
100
+ "iopub.status.idle": "2022-05-07T20:21:51.458730Z",
101
+ "shell.execute_reply": "2022-05-07T20:21:51.457854Z"
102
+ },
103
+ "papermill": {
104
+ "duration": 0.025537,
105
+ "end_time": "2022-05-07T20:21:51.461052",
106
+ "exception": false,
107
+ "start_time": "2022-05-07T20:21:51.435515",
108
+ "status": "completed"
109
+ },
110
+ "tags": []
111
+ },
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "71.58333333333333"
117
+ ]
118
+ },
119
+ "execution_count": 3,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "4295 / 60"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 4,
131
+ "id": "9f9a6faf",
132
+ "metadata": {
133
+ "execution": {
134
+ "iopub.execute_input": "2022-05-07T20:21:51.493604Z",
135
+ "iopub.status.busy": "2022-05-07T20:21:51.492779Z",
136
+ "iopub.status.idle": "2022-05-07T20:21:51.497692Z",
137
+ "shell.execute_reply": "2022-05-07T20:21:51.497031Z"
138
+ },
139
+ "papermill": {
140
+ "duration": 0.023598,
141
+ "end_time": "2022-05-07T20:21:51.499886",
142
+ "exception": false,
143
+ "start_time": "2022-05-07T20:21:51.476288",
144
+ "status": "completed"
145
+ },
146
+ "tags": []
147
+ },
148
+ "outputs": [],
149
+ "source": [
150
+ "MINS = [i * 5 for i in range(1, 13)]\n"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 5,
156
+ "id": "48ffe86d",
157
+ "metadata": {
158
+ "execution": {
159
+ "iopub.execute_input": "2022-05-07T20:21:51.532980Z",
160
+ "iopub.status.busy": "2022-05-07T20:21:51.532352Z",
161
+ "iopub.status.idle": "2022-05-07T20:21:51.539573Z",
162
+ "shell.execute_reply": "2022-05-07T20:21:51.538702Z"
163
+ },
164
+ "papermill": {
165
+ "duration": 0.026147,
166
+ "end_time": "2022-05-07T20:21:51.541893",
167
+ "exception": false,
168
+ "start_time": "2022-05-07T20:21:51.515746",
169
+ "status": "completed"
170
+ },
171
+ "tags": []
172
+ },
173
+ "outputs": [
174
+ {
175
+ "data": {
176
+ "text/plain": [
177
+ "[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]"
178
+ ]
179
+ },
180
+ "execution_count": 5,
181
+ "metadata": {},
182
+ "output_type": "execute_result"
183
+ }
184
+ ],
185
+ "source": [
186
+ "MINS"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 6,
192
+ "id": "c2190118",
193
+ "metadata": {
194
+ "execution": {
195
+ "iopub.execute_input": "2022-05-07T20:21:51.575902Z",
196
+ "iopub.status.busy": "2022-05-07T20:21:51.575081Z",
197
+ "iopub.status.idle": "2022-05-07T20:21:51.579726Z",
198
+ "shell.execute_reply": "2022-05-07T20:21:51.578698Z"
199
+ },
200
+ "papermill": {
201
+ "duration": 0.02397,
202
+ "end_time": "2022-05-07T20:21:51.581997",
203
+ "exception": false,
204
+ "start_time": "2022-05-07T20:21:51.558027",
205
+ "status": "completed"
206
+ },
207
+ "tags": []
208
+ },
209
+ "outputs": [],
210
+ "source": [
211
+ "WAVDIR = \"/kaggle/input/ljspeech-for-asr/wav16\""
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "markdown",
216
+ "id": "0dd1b778",
217
+ "metadata": {
218
+ "papermill": {
219
+ "duration": 0.016062,
220
+ "end_time": "2022-05-07T20:21:51.613958",
221
+ "exception": false,
222
+ "start_time": "2022-05-07T20:21:51.597896",
223
+ "status": "completed"
224
+ },
225
+ "tags": []
226
+ },
227
+ "source": [
228
+ "# Minute splits"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 7,
234
+ "id": "b39e450f",
235
+ "metadata": {
236
+ "execution": {
237
+ "iopub.execute_input": "2022-05-07T20:21:51.647074Z",
238
+ "iopub.status.busy": "2022-05-07T20:21:51.646766Z",
239
+ "iopub.status.idle": "2022-05-07T20:21:51.665170Z",
240
+ "shell.execute_reply": "2022-05-07T20:21:51.664195Z"
241
+ },
242
+ "papermill": {
243
+ "duration": 0.03809,
244
+ "end_time": "2022-05-07T20:21:51.667729",
245
+ "exception": false,
246
+ "start_time": "2022-05-07T20:21:51.629639",
247
+ "status": "completed"
248
+ },
249
+ "tags": []
250
+ },
251
+ "outputs": [],
252
+ "source": [
253
+ "for min in MINS:\n",
254
+ " frames = min * 60 * 16000\n",
255
+ " idlist = [k for k in train_frames.keys()]\n",
256
+ " outtsv = f\"{min}mins.tsv\"\n",
257
+ " with open(outtsv, \"w\") as of:\n",
258
+ " current = 0\n",
259
+ " of.write(f\"{WAVDIR}\\n\")\n",
260
+ " while frames > 0 and frames > current:\n",
261
+ " id = idlist.pop(0)\n",
262
+ " current = train_frames[id]\n",
263
+ " of.write(f\"{id}.wav\\t{current}\\n\")\n",
264
+ " frames = frames - current\n",
265
+ " max = 0\n",
266
+ " maxid = \"\"\n",
267
+ " for id in idlist:\n",
268
+ " time = train_frames[id]\n",
269
+ " if time > current:\n",
270
+ " continue\n",
271
+ " if time > max:\n",
272
+ " max = time\n",
273
+ " maxid = id\n",
274
+ " of.write(f\"{maxid}.wav\\t{max}\\n\")"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "markdown",
279
+ "id": "f4127c0d",
280
+ "metadata": {
281
+ "papermill": {
282
+ "duration": 0.016089,
283
+ "end_time": "2022-05-07T20:21:51.699865",
284
+ "exception": false,
285
+ "start_time": "2022-05-07T20:21:51.683776",
286
+ "status": "completed"
287
+ },
288
+ "tags": []
289
+ },
290
+ "source": [
291
+ "# Generate `ltr` files"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 8,
297
+ "id": "df62fef6",
298
+ "metadata": {
299
+ "execution": {
300
+ "iopub.execute_input": "2022-05-07T20:21:51.734361Z",
301
+ "iopub.status.busy": "2022-05-07T20:21:51.734065Z",
302
+ "iopub.status.idle": "2022-05-07T20:21:51.739428Z",
303
+ "shell.execute_reply": "2022-05-07T20:21:51.738516Z"
304
+ },
305
+ "papermill": {
306
+ "duration": 0.024729,
307
+ "end_time": "2022-05-07T20:21:51.741510",
308
+ "exception": false,
309
+ "start_time": "2022-05-07T20:21:51.716781",
310
+ "status": "completed"
311
+ },
312
+ "tags": []
313
+ },
314
+ "outputs": [],
315
+ "source": [
316
+ "def fairseqify(text):\n",
317
+ " text = text.strip().replace(\"  \", \" \")\n",
318
+ " words = text.split(\" \")\n",
319
+ " spread = [\" \".join(a) for a in words]\n",
320
+ " return \" | \".join(spread) + \" |\""
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 9,
326
+ "id": "1f1f0ede",
327
+ "metadata": {
328
+ "execution": {
329
+ "iopub.execute_input": "2022-05-07T20:21:51.775253Z",
330
+ "iopub.status.busy": "2022-05-07T20:21:51.774938Z",
331
+ "iopub.status.idle": "2022-05-07T20:21:51.796186Z",
332
+ "shell.execute_reply": "2022-05-07T20:21:51.795461Z"
333
+ },
334
+ "papermill": {
335
+ "duration": 0.040472,
336
+ "end_time": "2022-05-07T20:21:51.798476",
337
+ "exception": false,
338
+ "start_time": "2022-05-07T20:21:51.758004",
339
+ "status": "completed"
340
+ },
341
+ "tags": []
342
+ },
343
+ "outputs": [],
344
+ "source": [
345
+ "transcripts = {}\n",
346
+ "with open(\"../input/cmu-us-awb-arctic-fairseq-files/text.tsv\") as tf:\n",
347
+ " for line in tf.readlines():\n",
348
+ " line = line.strip()\n",
349
+ " if not \"\\t\" in line:\n",
350
+ " continue\n",
351
+ " parts = line.split(\"\\t\")\n",
352
+ " assert len(parts) == 2\n",
353
+ " transcripts[parts[0]] = fairseqify(parts[1])"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 10,
359
+ "id": "b39c56c7",
360
+ "metadata": {
361
+ "execution": {
362
+ "iopub.execute_input": "2022-05-07T20:21:51.832082Z",
363
+ "iopub.status.busy": "2022-05-07T20:21:51.831792Z",
364
+ "iopub.status.idle": "2022-05-07T20:21:51.848438Z",
365
+ "shell.execute_reply": "2022-05-07T20:21:51.847533Z"
366
+ },
367
+ "papermill": {
368
+ "duration": 0.036232,
369
+ "end_time": "2022-05-07T20:21:51.850791",
370
+ "exception": false,
371
+ "start_time": "2022-05-07T20:21:51.814559",
372
+ "status": "completed"
373
+ },
374
+ "tags": []
375
+ },
376
+ "outputs": [],
377
+ "source": [
378
+ "import glob\n",
379
+ "for tsv in glob.glob(\"*.tsv\"):\n",
380
+ " out = tsv.replace(\".tsv\", \".ltr\")\n",
381
+ " with open(tsv) as inf, open(out, \"w\") as outf:\n",
382
+ " for line in inf.readlines()[1:]:\n",
383
+ " id, _ = line.split(\"\\t\")\n",
384
+ " id = id.replace(\".wav\", \"\")\n",
385
+ " outf.write(f\"{transcripts[id]}\\n\")"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "markdown",
390
+ "id": "b5d1f3dc",
391
+ "metadata": {
392
+ "papermill": {
393
+ "duration": 0.015826,
394
+ "end_time": "2022-05-07T20:21:51.883384",
395
+ "exception": false,
396
+ "start_time": "2022-05-07T20:21:51.867558",
397
+ "status": "completed"
398
+ },
399
+ "tags": []
400
+ },
401
+ "source": [
402
+ "# Tidy up"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": 11,
408
+ "id": "8bf75958",
409
+ "metadata": {
410
+ "execution": {
411
+ "iopub.execute_input": "2022-05-07T20:21:51.917484Z",
412
+ "iopub.status.busy": "2022-05-07T20:21:51.917020Z",
413
+ "iopub.status.idle": "2022-05-07T20:21:53.699362Z",
414
+ "shell.execute_reply": "2022-05-07T20:21:53.698383Z"
415
+ },
416
+ "papermill": {
417
+ "duration": 1.802712,
418
+ "end_time": "2022-05-07T20:21:53.702205",
419
+ "exception": false,
420
+ "start_time": "2022-05-07T20:21:51.899493",
421
+ "status": "completed"
422
+ },
423
+ "tags": []
424
+ },
425
+ "outputs": [
426
+ {
427
+ "name": "stdout",
428
+ "output_type": "stream",
429
+ "text": [
430
+ "--2022-05-07 20:21:52-- https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt\r\n",
431
+ "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...\r\n",
432
+ "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.\r\n",
433
+ "HTTP request sent, awaiting response... 200 OK\r\n",
434
+ "Length: 207 [text/plain]\r\n",
435
+ "Saving to: ‘dict.ltr.txt’\r\n",
436
+ "\r\n",
437
+ "dict.ltr.txt 100%[===================>] 207 --.-KB/s in 0s \r\n",
438
+ "\r\n",
439
+ "2022-05-07 20:21:53 (40.9 MB/s) - ‘dict.ltr.txt’ saved [207/207]\r\n",
440
+ "\r\n"
441
+ ]
442
+ }
443
+ ],
444
+ "source": [
445
+ "!wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 12,
451
+ "id": "cd0e9762",
452
+ "metadata": {
453
+ "execution": {
454
+ "iopub.execute_input": "2022-05-07T20:21:53.741484Z",
455
+ "iopub.status.busy": "2022-05-07T20:21:53.741123Z",
456
+ "iopub.status.idle": "2022-05-07T20:21:54.715272Z",
457
+ "shell.execute_reply": "2022-05-07T20:21:54.714013Z"
458
+ },
459
+ "papermill": {
460
+ "duration": 0.997334,
461
+ "end_time": "2022-05-07T20:21:54.717846",
462
+ "exception": false,
463
+ "start_time": "2022-05-07T20:21:53.720512",
464
+ "status": "completed"
465
+ },
466
+ "tags": []
467
+ },
468
+ "outputs": [],
469
+ "source": [
470
+ "!for i in *mins.tsv;do b=$(basename $i \".tsv\");mkdir $b; mv $b.tsv $b/train.tsv; mv $b.ltr $b/train.ltr; cp dict.ltr.txt ../input/cmu-us-awb-arctic-fairseq-files/test.* $b/;cp ../input/cmu-us-awb-arctic-fairseq-files/dev.tsv $b/valid.tsv; cp ../input/cmu-us-awb-arctic-fairseq-files/dev.ltr $b/valid.ltr;done"
471
+ ]
472
+ }
473
+ ],
474
+ "metadata": {
475
+ "kernelspec": {
476
+ "display_name": "Python 3",
477
+ "language": "python",
478
+ "name": "python3"
479
+ },
480
+ "language_info": {
481
+ "codemirror_mode": {
482
+ "name": "ipython",
483
+ "version": 3
484
+ },
485
+ "file_extension": ".py",
486
+ "mimetype": "text/x-python",
487
+ "name": "python",
488
+ "nbconvert_exporter": "python",
489
+ "pygments_lexer": "ipython3",
490
+ "version": "3.7.12"
491
+ },
492
+ "papermill": {
493
+ "default_parameters": {},
494
+ "duration": 14.656502,
495
+ "end_time": "2022-05-07T20:21:55.457936",
496
+ "environment_variables": {},
497
+ "exception": null,
498
+ "input_path": "__notebook__.ipynb",
499
+ "output_path": "__notebook__.ipynb",
500
+ "parameters": {},
501
+ "start_time": "2022-05-07T20:21:40.801434",
502
+ "version": "2.3.4"
503
+ }
504
+ },
505
+ "nbformat": 4,
506
+ "nbformat_minor": 5
507
+ }