add notebooks from Kaggle
Browse files
kaggle-notebooks/cmu-us-awb-arctic-fairseq-files.ipynb
ADDED
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "bddfd111",
|
7 |
+
"metadata": {
|
8 |
+
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
|
9 |
+
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
|
10 |
+
"execution": {
|
11 |
+
"iopub.execute_input": "2022-05-07T08:23:49.094601Z",
|
12 |
+
"iopub.status.busy": "2022-05-07T08:23:49.094098Z",
|
13 |
+
"iopub.status.idle": "2022-05-07T08:23:49.103907Z",
|
14 |
+
"shell.execute_reply": "2022-05-07T08:23:49.103002Z"
|
15 |
+
},
|
16 |
+
"papermill": {
|
17 |
+
"duration": 0.026388,
|
18 |
+
"end_time": "2022-05-07T08:23:49.105961",
|
19 |
+
"exception": false,
|
20 |
+
"start_time": "2022-05-07T08:23:49.079573",
|
21 |
+
"status": "completed"
|
22 |
+
},
|
23 |
+
"tags": []
|
24 |
+
},
|
25 |
+
"outputs": [],
|
26 |
+
"source": [
|
27 |
+
"RAWTEXT = \"../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/etc/txt.done.data\""
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"execution_count": 2,
|
33 |
+
"id": "146a024b",
|
34 |
+
"metadata": {
|
35 |
+
"execution": {
|
36 |
+
"iopub.execute_input": "2022-05-07T08:23:49.130710Z",
|
37 |
+
"iopub.status.busy": "2022-05-07T08:23:49.130417Z",
|
38 |
+
"iopub.status.idle": "2022-05-07T08:23:49.135862Z",
|
39 |
+
"shell.execute_reply": "2022-05-07T08:23:49.134985Z"
|
40 |
+
},
|
41 |
+
"papermill": {
|
42 |
+
"duration": 0.020182,
|
43 |
+
"end_time": "2022-05-07T08:23:49.138041",
|
44 |
+
"exception": false,
|
45 |
+
"start_time": "2022-05-07T08:23:49.117859",
|
46 |
+
"status": "completed"
|
47 |
+
},
|
48 |
+
"tags": []
|
49 |
+
},
|
50 |
+
"outputs": [],
|
51 |
+
"source": [
|
52 |
+
"NORMS = {\n",
|
53 |
+
" \"0.75\": \"zero point seven five\",\n",
|
54 |
+
" \"t.h\": \"t h\",\n",
|
55 |
+
" \"1880\": \"eighteen eighty\",\n",
|
56 |
+
" \"16\": \"sixteenth\",\n",
|
57 |
+
" \"1908\": \"nineteen oh eight\",\n",
|
58 |
+
" \"18\": \"eighteenth\",\n",
|
59 |
+
" \"17\": \"seventeenth\",\n",
|
60 |
+
" \"29th\": \"twenty ninth\",\n",
|
61 |
+
" \"mrs\": \"misses\",\n",
|
62 |
+
" \"etc\": \"etcetera\",\n",
|
63 |
+
" \"etc.\": \"etcetera\",\n",
|
64 |
+
" \"to-day\": \"today\",\n",
|
65 |
+
" \"to-day's\": \"today's\",\n",
|
66 |
+
" \"to-morrow\": \"tomorrow\"\n",
|
67 |
+
"}"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"cell_type": "code",
|
72 |
+
"execution_count": 3,
|
73 |
+
"id": "46c0c682",
|
74 |
+
"metadata": {
|
75 |
+
"execution": {
|
76 |
+
"iopub.execute_input": "2022-05-07T08:23:49.163049Z",
|
77 |
+
"iopub.status.busy": "2022-05-07T08:23:49.162461Z",
|
78 |
+
"iopub.status.idle": "2022-05-07T08:23:49.170619Z",
|
79 |
+
"shell.execute_reply": "2022-05-07T08:23:49.169730Z"
|
80 |
+
},
|
81 |
+
"papermill": {
|
82 |
+
"duration": 0.022915,
|
83 |
+
"end_time": "2022-05-07T08:23:49.172592",
|
84 |
+
"exception": false,
|
85 |
+
"start_time": "2022-05-07T08:23:49.149677",
|
86 |
+
"status": "completed"
|
87 |
+
},
|
88 |
+
"tags": []
|
89 |
+
},
|
90 |
+
"outputs": [],
|
91 |
+
"source": [
|
92 |
+
"def _check_apos(word):\n",
|
93 |
+
" if word.endswith(\"'s\"):\n",
|
94 |
+
" return word\n",
|
95 |
+
" elif word.endswith(\"s'\"):\n",
|
96 |
+
" return word\n",
|
97 |
+
" elif word.endswith(\"'d\"):\n",
|
98 |
+
" return word\n",
|
99 |
+
" elif word.endswith(\"'ve\"):\n",
|
100 |
+
" return word\n",
|
101 |
+
" elif word.endswith(\"'re\"):\n",
|
102 |
+
" return word\n",
|
103 |
+
" elif word.endswith(\"'ll\"):\n",
|
104 |
+
" return word\n",
|
105 |
+
" elif word.endswith(\"n't\"):\n",
|
106 |
+
" return word\n",
|
107 |
+
" elif word.endswith(\"'ve\"):\n",
|
108 |
+
" return word\n",
|
109 |
+
" elif word in [\"i'm\", \"'em\", \"o'brien\"]:\n",
|
110 |
+
" return word\n",
|
111 |
+
" else:\n",
|
112 |
+
" return word.replace(\"'\", \"\")\n",
|
113 |
+
"\n",
|
114 |
+
"def fix_apos(text):\n",
|
115 |
+
" words = [_check_apos(w) for w in text.split(\" \")]\n",
|
116 |
+
" return \" \".join(words)"
|
117 |
+
]
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"cell_type": "code",
|
121 |
+
"execution_count": 4,
|
122 |
+
"id": "ad9ec20c",
|
123 |
+
"metadata": {
|
124 |
+
"execution": {
|
125 |
+
"iopub.execute_input": "2022-05-07T08:23:49.197464Z",
|
126 |
+
"iopub.status.busy": "2022-05-07T08:23:49.197091Z",
|
127 |
+
"iopub.status.idle": "2022-05-07T08:23:49.206427Z",
|
128 |
+
"shell.execute_reply": "2022-05-07T08:23:49.205485Z"
|
129 |
+
},
|
130 |
+
"papermill": {
|
131 |
+
"duration": 0.024203,
|
132 |
+
"end_time": "2022-05-07T08:23:49.208449",
|
133 |
+
"exception": false,
|
134 |
+
"start_time": "2022-05-07T08:23:49.184246",
|
135 |
+
"status": "completed"
|
136 |
+
},
|
137 |
+
"tags": []
|
138 |
+
},
|
139 |
+
"outputs": [],
|
140 |
+
"source": [
|
141 |
+
"def normalise(text):\n",
|
142 |
+
" if text[-1] == \".\":\n",
|
143 |
+
" text = text[:-1]\n",
|
144 |
+
" text = text.lower()\n",
|
145 |
+
" words = []\n",
|
146 |
+
" text = text.replace(\",\", \"\")\n",
|
147 |
+
" for word in text.split(\" \"):\n",
|
148 |
+
" if word in NORMS:\n",
|
149 |
+
" words.append(NORMS[word])\n",
|
150 |
+
" else:\n",
|
151 |
+
" words.append(word)\n",
|
152 |
+
" text = \" \".join(words)\n",
|
153 |
+
" text = text.replace(\".\", \"\")\n",
|
154 |
+
" text = text.replace(\"?\", \"\")\n",
|
155 |
+
" text = text.replace(\"!\", \"\")\n",
|
156 |
+
" text = text.replace(\":\", \"\")\n",
|
157 |
+
" text = text.replace(\";\", \"\")\n",
|
158 |
+
" text = text.replace(\"--\", \" \")\n",
|
159 |
+
" text = text.replace(\" \", \" \")\n",
|
160 |
+
" text = text.replace(\" - \", \" \")\n",
|
161 |
+
" text = text.replace(\"to- morrow\", \"tomorrow\")\n",
|
162 |
+
" text = fix_apos(text)\n",
|
163 |
+
" text = text.replace(\"-\", \" \")\n",
|
164 |
+
" return text.strip().upper()"
|
165 |
+
]
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": 5,
|
170 |
+
"id": "aec22c8a",
|
171 |
+
"metadata": {
|
172 |
+
"execution": {
|
173 |
+
"iopub.execute_input": "2022-05-07T08:23:49.233246Z",
|
174 |
+
"iopub.status.busy": "2022-05-07T08:23:49.232952Z",
|
175 |
+
"iopub.status.idle": "2022-05-07T08:23:49.275141Z",
|
176 |
+
"shell.execute_reply": "2022-05-07T08:23:49.274337Z"
|
177 |
+
},
|
178 |
+
"papermill": {
|
179 |
+
"duration": 0.057432,
|
180 |
+
"end_time": "2022-05-07T08:23:49.277432",
|
181 |
+
"exception": false,
|
182 |
+
"start_time": "2022-05-07T08:23:49.220000",
|
183 |
+
"status": "completed"
|
184 |
+
},
|
185 |
+
"tags": []
|
186 |
+
},
|
187 |
+
"outputs": [],
|
188 |
+
"source": [
|
189 |
+
"data = {}\n",
|
190 |
+
"with open(RAWTEXT) as inf:\n",
|
191 |
+
" for line in inf.readlines():\n",
|
192 |
+
" first_space = line.find(' ')\n",
|
193 |
+
" first_quote = line.find('\"')\n",
|
194 |
+
" last_quote = line.rfind('\"')\n",
|
195 |
+
" id = line[first_space+1:first_quote].strip()\n",
|
196 |
+
" text = line[first_quote+1:last_quote]\n",
|
197 |
+
" data[id] = normalise(text)"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "code",
|
202 |
+
"execution_count": 6,
|
203 |
+
"id": "c4d12ef6",
|
204 |
+
"metadata": {
|
205 |
+
"execution": {
|
206 |
+
"iopub.execute_input": "2022-05-07T08:23:49.302618Z",
|
207 |
+
"iopub.status.busy": "2022-05-07T08:23:49.302155Z",
|
208 |
+
"iopub.status.idle": "2022-05-07T08:23:49.306841Z",
|
209 |
+
"shell.execute_reply": "2022-05-07T08:23:49.306240Z"
|
210 |
+
},
|
211 |
+
"papermill": {
|
212 |
+
"duration": 0.019827,
|
213 |
+
"end_time": "2022-05-07T08:23:49.308829",
|
214 |
+
"exception": false,
|
215 |
+
"start_time": "2022-05-07T08:23:49.289002",
|
216 |
+
"status": "completed"
|
217 |
+
},
|
218 |
+
"tags": []
|
219 |
+
},
|
220 |
+
"outputs": [],
|
221 |
+
"source": [
|
222 |
+
"with open(\"text.tsv\", \"w\") as of:\n",
|
223 |
+
" for id in data.keys():\n",
|
224 |
+
" of.write(f\"{id}\\t{data[id]}\\n\")"
|
225 |
+
]
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"cell_type": "code",
|
229 |
+
"execution_count": 7,
|
230 |
+
"id": "62dc9132",
|
231 |
+
"metadata": {
|
232 |
+
"execution": {
|
233 |
+
"iopub.execute_input": "2022-05-07T08:23:49.333170Z",
|
234 |
+
"iopub.status.busy": "2022-05-07T08:23:49.332878Z",
|
235 |
+
"iopub.status.idle": "2022-05-07T08:23:58.857125Z",
|
236 |
+
"shell.execute_reply": "2022-05-07T08:23:58.855696Z"
|
237 |
+
},
|
238 |
+
"papermill": {
|
239 |
+
"duration": 9.539562,
|
240 |
+
"end_time": "2022-05-07T08:23:58.859914",
|
241 |
+
"exception": false,
|
242 |
+
"start_time": "2022-05-07T08:23:49.320352",
|
243 |
+
"status": "completed"
|
244 |
+
},
|
245 |
+
"tags": []
|
246 |
+
},
|
247 |
+
"outputs": [
|
248 |
+
{
|
249 |
+
"name": "stdout",
|
250 |
+
"output_type": "stream",
|
251 |
+
"text": [
|
252 |
+
"Total: 4777.0\n"
|
253 |
+
]
|
254 |
+
}
|
255 |
+
],
|
256 |
+
"source": [
|
257 |
+
"from pathlib import Path\n",
|
258 |
+
"import soundfile as sf\n",
|
259 |
+
"\n",
|
260 |
+
"total = 0\n",
|
261 |
+
"WAVPATH = Path(\"../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/wav/\")\n",
|
262 |
+
"with open(\"frames.tsv\", \"w\") as of:\n",
|
263 |
+
" for wav in WAVPATH.glob(\"*.wav\"):\n",
|
264 |
+
" frames, sr = sf.read(str(wav))\n",
|
265 |
+
" assert sr == 16000\n",
|
266 |
+
" total += len(frames)\n",
|
267 |
+
" of.write(f\"{wav.stem}.wav\\t{len(frames)}\\n\")\n",
|
268 |
+
"print(\"Total:\", total / 16000)"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"cell_type": "code",
|
273 |
+
"execution_count": 8,
|
274 |
+
"id": "71a8a65f",
|
275 |
+
"metadata": {
|
276 |
+
"execution": {
|
277 |
+
"iopub.execute_input": "2022-05-07T08:23:58.896455Z",
|
278 |
+
"iopub.status.busy": "2022-05-07T08:23:58.895715Z",
|
279 |
+
"iopub.status.idle": "2022-05-07T08:24:01.181186Z",
|
280 |
+
"shell.execute_reply": "2022-05-07T08:24:01.179981Z"
|
281 |
+
},
|
282 |
+
"papermill": {
|
283 |
+
"duration": 2.308171,
|
284 |
+
"end_time": "2022-05-07T08:24:01.183774",
|
285 |
+
"exception": false,
|
286 |
+
"start_time": "2022-05-07T08:23:58.875603",
|
287 |
+
"status": "completed"
|
288 |
+
},
|
289 |
+
"tags": []
|
290 |
+
},
|
291 |
+
"outputs": [],
|
292 |
+
"source": [
|
293 |
+
"lines=!wc -l frames.tsv|awk '{print $1}'\n",
|
294 |
+
"!tail -n 114 frames.tsv |head -n 57 > test.tsv\n",
|
295 |
+
"!tail -n 114 frames.tsv |tail -n 57 > dev.tsv\n",
|
296 |
+
"!head -n $((1138-114)) frames.tsv > train.tsv"
|
297 |
+
]
|
298 |
+
},
|
299 |
+
{
|
300 |
+
"cell_type": "code",
|
301 |
+
"execution_count": 9,
|
302 |
+
"id": "6c7413f1",
|
303 |
+
"metadata": {
|
304 |
+
"execution": {
|
305 |
+
"iopub.execute_input": "2022-05-07T08:24:01.210537Z",
|
306 |
+
"iopub.status.busy": "2022-05-07T08:24:01.209625Z",
|
307 |
+
"iopub.status.idle": "2022-05-07T08:24:01.215252Z",
|
308 |
+
"shell.execute_reply": "2022-05-07T08:24:01.214668Z"
|
309 |
+
},
|
310 |
+
"papermill": {
|
311 |
+
"duration": 0.021169,
|
312 |
+
"end_time": "2022-05-07T08:24:01.217174",
|
313 |
+
"exception": false,
|
314 |
+
"start_time": "2022-05-07T08:24:01.196005",
|
315 |
+
"status": "completed"
|
316 |
+
},
|
317 |
+
"tags": []
|
318 |
+
},
|
319 |
+
"outputs": [],
|
320 |
+
"source": [
|
321 |
+
"def do_fairseq(text):\n",
|
322 |
+
" words = text.split(\" \")\n",
|
323 |
+
" owords = [\" \".join(w) for w in words]\n",
|
324 |
+
" return \" | \".join(owords) + \" |\""
|
325 |
+
]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"cell_type": "code",
|
329 |
+
"execution_count": 10,
|
330 |
+
"id": "9342e47e",
|
331 |
+
"metadata": {
|
332 |
+
"execution": {
|
333 |
+
"iopub.execute_input": "2022-05-07T08:24:01.242716Z",
|
334 |
+
"iopub.status.busy": "2022-05-07T08:24:01.242299Z",
|
335 |
+
"iopub.status.idle": "2022-05-07T08:24:01.257797Z",
|
336 |
+
"shell.execute_reply": "2022-05-07T08:24:01.257054Z"
|
337 |
+
},
|
338 |
+
"papermill": {
|
339 |
+
"duration": 0.031049,
|
340 |
+
"end_time": "2022-05-07T08:24:01.260092",
|
341 |
+
"exception": false,
|
342 |
+
"start_time": "2022-05-07T08:24:01.229043",
|
343 |
+
"status": "completed"
|
344 |
+
},
|
345 |
+
"tags": []
|
346 |
+
},
|
347 |
+
"outputs": [],
|
348 |
+
"source": [
|
349 |
+
"for part in [\"test\", \"train\", \"dev\"]:\n",
|
350 |
+
" ids = []\n",
|
351 |
+
" with open(f\"{part}.ltr\", \"w\") as of, open(f\"{part}.tsv\") as inf:\n",
|
352 |
+
" for line in inf.readlines():\n",
|
353 |
+
" if \"\\t\" in line:\n",
|
354 |
+
" parts = line.strip().split(\"\\t\")\n",
|
355 |
+
" id = parts[0].replace(\".wav\", \"\")\n",
|
356 |
+
" of.write(do_fairseq(data[id]) + \"\\n\")"
|
357 |
+
]
|
358 |
+
}
|
359 |
+
],
|
360 |
+
"metadata": {
|
361 |
+
"kernelspec": {
|
362 |
+
"display_name": "Python 3",
|
363 |
+
"language": "python",
|
364 |
+
"name": "python3"
|
365 |
+
},
|
366 |
+
"language_info": {
|
367 |
+
"codemirror_mode": {
|
368 |
+
"name": "ipython",
|
369 |
+
"version": 3
|
370 |
+
},
|
371 |
+
"file_extension": ".py",
|
372 |
+
"mimetype": "text/x-python",
|
373 |
+
"name": "python",
|
374 |
+
"nbconvert_exporter": "python",
|
375 |
+
"pygments_lexer": "ipython3",
|
376 |
+
"version": "3.7.12"
|
377 |
+
},
|
378 |
+
"papermill": {
|
379 |
+
"default_parameters": {},
|
380 |
+
"duration": 23.168986,
|
381 |
+
"end_time": "2022-05-07T08:24:01.994036",
|
382 |
+
"environment_variables": {},
|
383 |
+
"exception": null,
|
384 |
+
"input_path": "__notebook__.ipynb",
|
385 |
+
"output_path": "__notebook__.ipynb",
|
386 |
+
"parameters": {},
|
387 |
+
"start_time": "2022-05-07T08:23:38.825050",
|
388 |
+
"version": "2.3.4"
|
389 |
+
}
|
390 |
+
},
|
391 |
+
"nbformat": 4,
|
392 |
+
"nbformat_minor": 5
|
393 |
+
}
|
kaggle-notebooks/create-awb-splits.ipynb
ADDED
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "d250a6ca",
|
6 |
+
"metadata": {
|
7 |
+
"papermill": {
|
8 |
+
"duration": 0.015615,
|
9 |
+
"end_time": "2022-05-07T20:21:51.320467",
|
10 |
+
"exception": false,
|
11 |
+
"start_time": "2022-05-07T20:21:51.304852",
|
12 |
+
"status": "completed"
|
13 |
+
},
|
14 |
+
"tags": []
|
15 |
+
},
|
16 |
+
"source": [
|
17 |
+
"# Load frame lengths"
|
18 |
+
]
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"cell_type": "code",
|
22 |
+
"execution_count": 1,
|
23 |
+
"id": "c67326d6",
|
24 |
+
"metadata": {
|
25 |
+
"execution": {
|
26 |
+
"iopub.execute_input": "2022-05-07T20:21:51.351926Z",
|
27 |
+
"iopub.status.busy": "2022-05-07T20:21:51.351146Z",
|
28 |
+
"iopub.status.idle": "2022-05-07T20:21:51.375423Z",
|
29 |
+
"shell.execute_reply": "2022-05-07T20:21:51.374683Z"
|
30 |
+
},
|
31 |
+
"papermill": {
|
32 |
+
"duration": 0.042741,
|
33 |
+
"end_time": "2022-05-07T20:21:51.377820",
|
34 |
+
"exception": false,
|
35 |
+
"start_time": "2022-05-07T20:21:51.335079",
|
36 |
+
"status": "completed"
|
37 |
+
},
|
38 |
+
"tags": []
|
39 |
+
},
|
40 |
+
"outputs": [],
|
41 |
+
"source": [
|
42 |
+
"train_frames = {}\n",
|
43 |
+
"\n",
|
44 |
+
"total = 0\n",
|
45 |
+
"with open(\"../input/cmu-us-awb-arctic-fairseq-files/train.tsv\") as f:\n",
|
46 |
+
" for line in f.readlines():\n",
|
47 |
+
" if not \"\\t\" in line:\n",
|
48 |
+
" continue\n",
|
49 |
+
" pieces = line.strip().split(\"\\t\")\n",
|
50 |
+
" total += int(pieces[1])\n",
|
51 |
+
" assert len(pieces) == 2\n",
|
52 |
+
" id = pieces[0].replace(\".wav\", \"\")\n",
|
53 |
+
" train_frames[id] = int(pieces[1])"
|
54 |
+
]
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"cell_type": "code",
|
58 |
+
"execution_count": 2,
|
59 |
+
"id": "b7994029",
|
60 |
+
"metadata": {
|
61 |
+
"execution": {
|
62 |
+
"iopub.execute_input": "2022-05-07T20:21:51.408607Z",
|
63 |
+
"iopub.status.busy": "2022-05-07T20:21:51.408333Z",
|
64 |
+
"iopub.status.idle": "2022-05-07T20:21:51.415606Z",
|
65 |
+
"shell.execute_reply": "2022-05-07T20:21:51.414918Z"
|
66 |
+
},
|
67 |
+
"papermill": {
|
68 |
+
"duration": 0.025832,
|
69 |
+
"end_time": "2022-05-07T20:21:51.418472",
|
70 |
+
"exception": false,
|
71 |
+
"start_time": "2022-05-07T20:21:51.392640",
|
72 |
+
"status": "completed"
|
73 |
+
},
|
74 |
+
"tags": []
|
75 |
+
},
|
76 |
+
"outputs": [
|
77 |
+
{
|
78 |
+
"data": {
|
79 |
+
"text/plain": [
|
80 |
+
"4295.0"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
"execution_count": 2,
|
84 |
+
"metadata": {},
|
85 |
+
"output_type": "execute_result"
|
86 |
+
}
|
87 |
+
],
|
88 |
+
"source": [
|
89 |
+
"total / 16000"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "code",
|
94 |
+
"execution_count": 3,
|
95 |
+
"id": "3f849d42",
|
96 |
+
"metadata": {
|
97 |
+
"execution": {
|
98 |
+
"iopub.execute_input": "2022-05-07T20:21:51.453347Z",
|
99 |
+
"iopub.status.busy": "2022-05-07T20:21:51.452190Z",
|
100 |
+
"iopub.status.idle": "2022-05-07T20:21:51.458730Z",
|
101 |
+
"shell.execute_reply": "2022-05-07T20:21:51.457854Z"
|
102 |
+
},
|
103 |
+
"papermill": {
|
104 |
+
"duration": 0.025537,
|
105 |
+
"end_time": "2022-05-07T20:21:51.461052",
|
106 |
+
"exception": false,
|
107 |
+
"start_time": "2022-05-07T20:21:51.435515",
|
108 |
+
"status": "completed"
|
109 |
+
},
|
110 |
+
"tags": []
|
111 |
+
},
|
112 |
+
"outputs": [
|
113 |
+
{
|
114 |
+
"data": {
|
115 |
+
"text/plain": [
|
116 |
+
"71.58333333333333"
|
117 |
+
]
|
118 |
+
},
|
119 |
+
"execution_count": 3,
|
120 |
+
"metadata": {},
|
121 |
+
"output_type": "execute_result"
|
122 |
+
}
|
123 |
+
],
|
124 |
+
"source": [
|
125 |
+
"4295 / 60"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"cell_type": "code",
|
130 |
+
"execution_count": 4,
|
131 |
+
"id": "9f9a6faf",
|
132 |
+
"metadata": {
|
133 |
+
"execution": {
|
134 |
+
"iopub.execute_input": "2022-05-07T20:21:51.493604Z",
|
135 |
+
"iopub.status.busy": "2022-05-07T20:21:51.492779Z",
|
136 |
+
"iopub.status.idle": "2022-05-07T20:21:51.497692Z",
|
137 |
+
"shell.execute_reply": "2022-05-07T20:21:51.497031Z"
|
138 |
+
},
|
139 |
+
"papermill": {
|
140 |
+
"duration": 0.023598,
|
141 |
+
"end_time": "2022-05-07T20:21:51.499886",
|
142 |
+
"exception": false,
|
143 |
+
"start_time": "2022-05-07T20:21:51.476288",
|
144 |
+
"status": "completed"
|
145 |
+
},
|
146 |
+
"tags": []
|
147 |
+
},
|
148 |
+
"outputs": [],
|
149 |
+
"source": [
|
150 |
+
"MINS = [i * 5 for i in range(1, 13)]\n"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "code",
|
155 |
+
"execution_count": 5,
|
156 |
+
"id": "48ffe86d",
|
157 |
+
"metadata": {
|
158 |
+
"execution": {
|
159 |
+
"iopub.execute_input": "2022-05-07T20:21:51.532980Z",
|
160 |
+
"iopub.status.busy": "2022-05-07T20:21:51.532352Z",
|
161 |
+
"iopub.status.idle": "2022-05-07T20:21:51.539573Z",
|
162 |
+
"shell.execute_reply": "2022-05-07T20:21:51.538702Z"
|
163 |
+
},
|
164 |
+
"papermill": {
|
165 |
+
"duration": 0.026147,
|
166 |
+
"end_time": "2022-05-07T20:21:51.541893",
|
167 |
+
"exception": false,
|
168 |
+
"start_time": "2022-05-07T20:21:51.515746",
|
169 |
+
"status": "completed"
|
170 |
+
},
|
171 |
+
"tags": []
|
172 |
+
},
|
173 |
+
"outputs": [
|
174 |
+
{
|
175 |
+
"data": {
|
176 |
+
"text/plain": [
|
177 |
+
"[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
"execution_count": 5,
|
181 |
+
"metadata": {},
|
182 |
+
"output_type": "execute_result"
|
183 |
+
}
|
184 |
+
],
|
185 |
+
"source": [
|
186 |
+
"MINS"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": 6,
|
192 |
+
"id": "c2190118",
|
193 |
+
"metadata": {
|
194 |
+
"execution": {
|
195 |
+
"iopub.execute_input": "2022-05-07T20:21:51.575902Z",
|
196 |
+
"iopub.status.busy": "2022-05-07T20:21:51.575081Z",
|
197 |
+
"iopub.status.idle": "2022-05-07T20:21:51.579726Z",
|
198 |
+
"shell.execute_reply": "2022-05-07T20:21:51.578698Z"
|
199 |
+
},
|
200 |
+
"papermill": {
|
201 |
+
"duration": 0.02397,
|
202 |
+
"end_time": "2022-05-07T20:21:51.581997",
|
203 |
+
"exception": false,
|
204 |
+
"start_time": "2022-05-07T20:21:51.558027",
|
205 |
+
"status": "completed"
|
206 |
+
},
|
207 |
+
"tags": []
|
208 |
+
},
|
209 |
+
"outputs": [],
|
210 |
+
"source": [
|
211 |
+
"WAVDIR = \"/kaggle/input/ljspeech-for-asr/wav16\""
|
212 |
+
]
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"cell_type": "markdown",
|
216 |
+
"id": "0dd1b778",
|
217 |
+
"metadata": {
|
218 |
+
"papermill": {
|
219 |
+
"duration": 0.016062,
|
220 |
+
"end_time": "2022-05-07T20:21:51.613958",
|
221 |
+
"exception": false,
|
222 |
+
"start_time": "2022-05-07T20:21:51.597896",
|
223 |
+
"status": "completed"
|
224 |
+
},
|
225 |
+
"tags": []
|
226 |
+
},
|
227 |
+
"source": [
|
228 |
+
"# Minute splits"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"cell_type": "code",
|
233 |
+
"execution_count": 7,
|
234 |
+
"id": "b39e450f",
|
235 |
+
"metadata": {
|
236 |
+
"execution": {
|
237 |
+
"iopub.execute_input": "2022-05-07T20:21:51.647074Z",
|
238 |
+
"iopub.status.busy": "2022-05-07T20:21:51.646766Z",
|
239 |
+
"iopub.status.idle": "2022-05-07T20:21:51.665170Z",
|
240 |
+
"shell.execute_reply": "2022-05-07T20:21:51.664195Z"
|
241 |
+
},
|
242 |
+
"papermill": {
|
243 |
+
"duration": 0.03809,
|
244 |
+
"end_time": "2022-05-07T20:21:51.667729",
|
245 |
+
"exception": false,
|
246 |
+
"start_time": "2022-05-07T20:21:51.629639",
|
247 |
+
"status": "completed"
|
248 |
+
},
|
249 |
+
"tags": []
|
250 |
+
},
|
251 |
+
"outputs": [],
|
252 |
+
"source": [
|
253 |
+
"for min in MINS:\n",
|
254 |
+
" frames = min * 60 * 16000\n",
|
255 |
+
" idlist = [k for k in train_frames.keys()]\n",
|
256 |
+
" outtsv = f\"{min}mins.tsv\"\n",
|
257 |
+
" with open(outtsv, \"w\") as of:\n",
|
258 |
+
" current = 0\n",
|
259 |
+
" of.write(f\"{WAVDIR}\\n\")\n",
|
260 |
+
" while frames > 0 and frames > current:\n",
|
261 |
+
" id = idlist.pop(0)\n",
|
262 |
+
" current = train_frames[id]\n",
|
263 |
+
" of.write(f\"{id}.wav\\t{current}\\n\")\n",
|
264 |
+
" frames = frames - current\n",
|
265 |
+
" max = 0\n",
|
266 |
+
" maxid = \"\"\n",
|
267 |
+
" for id in idlist:\n",
|
268 |
+
" time = train_frames[id]\n",
|
269 |
+
" if time > current:\n",
|
270 |
+
" continue\n",
|
271 |
+
" if time > max:\n",
|
272 |
+
" max = time\n",
|
273 |
+
" maxid = id\n",
|
274 |
+
" of.write(f\"{maxid}.wav\\t{max}\\n\")"
|
275 |
+
]
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"cell_type": "markdown",
|
279 |
+
"id": "f4127c0d",
|
280 |
+
"metadata": {
|
281 |
+
"papermill": {
|
282 |
+
"duration": 0.016089,
|
283 |
+
"end_time": "2022-05-07T20:21:51.699865",
|
284 |
+
"exception": false,
|
285 |
+
"start_time": "2022-05-07T20:21:51.683776",
|
286 |
+
"status": "completed"
|
287 |
+
},
|
288 |
+
"tags": []
|
289 |
+
},
|
290 |
+
"source": [
|
291 |
+
"# Generate `ltr` files"
|
292 |
+
]
|
293 |
+
},
|
294 |
+
{
|
295 |
+
"cell_type": "code",
|
296 |
+
"execution_count": 8,
|
297 |
+
"id": "df62fef6",
|
298 |
+
"metadata": {
|
299 |
+
"execution": {
|
300 |
+
"iopub.execute_input": "2022-05-07T20:21:51.734361Z",
|
301 |
+
"iopub.status.busy": "2022-05-07T20:21:51.734065Z",
|
302 |
+
"iopub.status.idle": "2022-05-07T20:21:51.739428Z",
|
303 |
+
"shell.execute_reply": "2022-05-07T20:21:51.738516Z"
|
304 |
+
},
|
305 |
+
"papermill": {
|
306 |
+
"duration": 0.024729,
|
307 |
+
"end_time": "2022-05-07T20:21:51.741510",
|
308 |
+
"exception": false,
|
309 |
+
"start_time": "2022-05-07T20:21:51.716781",
|
310 |
+
"status": "completed"
|
311 |
+
},
|
312 |
+
"tags": []
|
313 |
+
},
|
314 |
+
"outputs": [],
|
315 |
+
"source": [
|
316 |
+
"def fairseqify(text):\n",
|
317 |
+
" text = text.strip().replace(\" \", \" \")\n",
|
318 |
+
" words = text.split(\" \")\n",
|
319 |
+
" spread = [\" \".join(a) for a in words]\n",
|
320 |
+
" return \" | \".join(spread) + \" |\""
|
321 |
+
]
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"cell_type": "code",
|
325 |
+
"execution_count": 9,
|
326 |
+
"id": "1f1f0ede",
|
327 |
+
"metadata": {
|
328 |
+
"execution": {
|
329 |
+
"iopub.execute_input": "2022-05-07T20:21:51.775253Z",
|
330 |
+
"iopub.status.busy": "2022-05-07T20:21:51.774938Z",
|
331 |
+
"iopub.status.idle": "2022-05-07T20:21:51.796186Z",
|
332 |
+
"shell.execute_reply": "2022-05-07T20:21:51.795461Z"
|
333 |
+
},
|
334 |
+
"papermill": {
|
335 |
+
"duration": 0.040472,
|
336 |
+
"end_time": "2022-05-07T20:21:51.798476",
|
337 |
+
"exception": false,
|
338 |
+
"start_time": "2022-05-07T20:21:51.758004",
|
339 |
+
"status": "completed"
|
340 |
+
},
|
341 |
+
"tags": []
|
342 |
+
},
|
343 |
+
"outputs": [],
|
344 |
+
"source": [
|
345 |
+
"transcripts = {}\n",
|
346 |
+
"with open(\"../input/cmu-us-awb-arctic-fairseq-files/text.tsv\") as tf:\n",
|
347 |
+
" for line in tf.readlines():\n",
|
348 |
+
" line = line.strip()\n",
|
349 |
+
" if not \"\\t\" in line:\n",
|
350 |
+
" continue\n",
|
351 |
+
" parts = line.split(\"\\t\")\n",
|
352 |
+
" assert len(parts) == 2\n",
|
353 |
+
" transcripts[parts[0]] = fairseqify(parts[1])"
|
354 |
+
]
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"cell_type": "code",
|
358 |
+
"execution_count": 10,
|
359 |
+
"id": "b39c56c7",
|
360 |
+
"metadata": {
|
361 |
+
"execution": {
|
362 |
+
"iopub.execute_input": "2022-05-07T20:21:51.832082Z",
|
363 |
+
"iopub.status.busy": "2022-05-07T20:21:51.831792Z",
|
364 |
+
"iopub.status.idle": "2022-05-07T20:21:51.848438Z",
|
365 |
+
"shell.execute_reply": "2022-05-07T20:21:51.847533Z"
|
366 |
+
},
|
367 |
+
"papermill": {
|
368 |
+
"duration": 0.036232,
|
369 |
+
"end_time": "2022-05-07T20:21:51.850791",
|
370 |
+
"exception": false,
|
371 |
+
"start_time": "2022-05-07T20:21:51.814559",
|
372 |
+
"status": "completed"
|
373 |
+
},
|
374 |
+
"tags": []
|
375 |
+
},
|
376 |
+
"outputs": [],
|
377 |
+
"source": [
|
378 |
+
"import glob\n",
|
379 |
+
"for tsv in glob.glob(\"*.tsv\"):\n",
|
380 |
+
" out = tsv.replace(\".tsv\", \".ltr\")\n",
|
381 |
+
" with open(tsv) as inf, open(out, \"w\") as outf:\n",
|
382 |
+
" for line in inf.readlines()[1:]:\n",
|
383 |
+
" id, _ = line.split(\"\\t\")\n",
|
384 |
+
" id = id.replace(\".wav\", \"\")\n",
|
385 |
+
" outf.write(f\"{transcripts[id]}\\n\")"
|
386 |
+
]
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"cell_type": "markdown",
|
390 |
+
"id": "b5d1f3dc",
|
391 |
+
"metadata": {
|
392 |
+
"papermill": {
|
393 |
+
"duration": 0.015826,
|
394 |
+
"end_time": "2022-05-07T20:21:51.883384",
|
395 |
+
"exception": false,
|
396 |
+
"start_time": "2022-05-07T20:21:51.867558",
|
397 |
+
"status": "completed"
|
398 |
+
},
|
399 |
+
"tags": []
|
400 |
+
},
|
401 |
+
"source": [
|
402 |
+
"# Tidy up"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
{
|
406 |
+
"cell_type": "code",
|
407 |
+
"execution_count": 11,
|
408 |
+
"id": "8bf75958",
|
409 |
+
"metadata": {
|
410 |
+
"execution": {
|
411 |
+
"iopub.execute_input": "2022-05-07T20:21:51.917484Z",
|
412 |
+
"iopub.status.busy": "2022-05-07T20:21:51.917020Z",
|
413 |
+
"iopub.status.idle": "2022-05-07T20:21:53.699362Z",
|
414 |
+
"shell.execute_reply": "2022-05-07T20:21:53.698383Z"
|
415 |
+
},
|
416 |
+
"papermill": {
|
417 |
+
"duration": 1.802712,
|
418 |
+
"end_time": "2022-05-07T20:21:53.702205",
|
419 |
+
"exception": false,
|
420 |
+
"start_time": "2022-05-07T20:21:51.899493",
|
421 |
+
"status": "completed"
|
422 |
+
},
|
423 |
+
"tags": []
|
424 |
+
},
|
425 |
+
"outputs": [
|
426 |
+
{
|
427 |
+
"name": "stdout",
|
428 |
+
"output_type": "stream",
|
429 |
+
"text": [
|
430 |
+
"--2022-05-07 20:21:52-- https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt\r\n",
|
431 |
+
"Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...\r\n",
|
432 |
+
"Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.\r\n",
|
433 |
+
"HTTP request sent, awaiting response... 200 OK\r\n",
|
434 |
+
"Length: 207 [text/plain]\r\n",
|
435 |
+
"Saving to: ‘dict.ltr.txt’\r\n",
|
436 |
+
"\r\n",
|
437 |
+
"dict.ltr.txt 100%[===================>] 207 --.-KB/s in 0s \r\n",
|
438 |
+
"\r\n",
|
439 |
+
"2022-05-07 20:21:53 (40.9 MB/s) - ‘dict.ltr.txt’ saved [207/207]\r\n",
|
440 |
+
"\r\n"
|
441 |
+
]
|
442 |
+
}
|
443 |
+
],
|
444 |
+
"source": [
|
445 |
+
"!wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt"
|
446 |
+
]
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"cell_type": "code",
|
450 |
+
"execution_count": 12,
|
451 |
+
"id": "cd0e9762",
|
452 |
+
"metadata": {
|
453 |
+
"execution": {
|
454 |
+
"iopub.execute_input": "2022-05-07T20:21:53.741484Z",
|
455 |
+
"iopub.status.busy": "2022-05-07T20:21:53.741123Z",
|
456 |
+
"iopub.status.idle": "2022-05-07T20:21:54.715272Z",
|
457 |
+
"shell.execute_reply": "2022-05-07T20:21:54.714013Z"
|
458 |
+
},
|
459 |
+
"papermill": {
|
460 |
+
"duration": 0.997334,
|
461 |
+
"end_time": "2022-05-07T20:21:54.717846",
|
462 |
+
"exception": false,
|
463 |
+
"start_time": "2022-05-07T20:21:53.720512",
|
464 |
+
"status": "completed"
|
465 |
+
},
|
466 |
+
"tags": []
|
467 |
+
},
|
468 |
+
"outputs": [],
|
469 |
+
"source": [
|
470 |
+
"!for i in *mins.tsv;do b=$(basename $i \".tsv\");mkdir $b; mv $b.tsv $b/train.tsv; mv $b.ltr $b/train.ltr; cp dict.ltr.txt ../input/cmu-us-awb-arctic-fairseq-files/test.* $b/;cp ../input/cmu-us-awb-arctic-fairseq-files/dev.tsv $b/valid.tsv; cp ../input/cmu-us-awb-arctic-fairseq-files/dev.ltr $b/valid.ltr;done"
|
471 |
+
]
|
472 |
+
}
|
473 |
+
],
|
474 |
+
"metadata": {
|
475 |
+
"kernelspec": {
|
476 |
+
"display_name": "Python 3",
|
477 |
+
"language": "python",
|
478 |
+
"name": "python3"
|
479 |
+
},
|
480 |
+
"language_info": {
|
481 |
+
"codemirror_mode": {
|
482 |
+
"name": "ipython",
|
483 |
+
"version": 3
|
484 |
+
},
|
485 |
+
"file_extension": ".py",
|
486 |
+
"mimetype": "text/x-python",
|
487 |
+
"name": "python",
|
488 |
+
"nbconvert_exporter": "python",
|
489 |
+
"pygments_lexer": "ipython3",
|
490 |
+
"version": "3.7.12"
|
491 |
+
},
|
492 |
+
"papermill": {
|
493 |
+
"default_parameters": {},
|
494 |
+
"duration": 14.656502,
|
495 |
+
"end_time": "2022-05-07T20:21:55.457936",
|
496 |
+
"environment_variables": {},
|
497 |
+
"exception": null,
|
498 |
+
"input_path": "__notebook__.ipynb",
|
499 |
+
"output_path": "__notebook__.ipynb",
|
500 |
+
"parameters": {},
|
501 |
+
"start_time": "2022-05-07T20:21:40.801434",
|
502 |
+
"version": "2.3.4"
|
503 |
+
}
|
504 |
+
},
|
505 |
+
"nbformat": 4,
|
506 |
+
"nbformat_minor": 5
|
507 |
+
}
|