# Transcript listing for the corpus; the loading cell pulls an utterance id and the
# quoted sentence out of each line of this file.
RAWTEXT = "../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/etc/txt.done.data"

# Replacement tables consulted by normalise(). Keys are compared against whole,
# space-delimited tokens after lower-casing and comma removal but *before* the
# remaining punctuation is stripped, which is why both "etc" and "etc." appear.

# Numerals, spelled the way they are to be read aloud.
_NUMBER_NORMS = {
    "0.75": "zero point seven five",
    "1880": "eighteen eighty",
    "1908": "nineteen oh eight",
    # Bare day numbers are read as ordinals -- presumably they only occur inside
    # dates in this corpus; confirm before reusing the table on other text.
    "16": "sixteenth",
    "17": "seventeenth",
    "18": "eighteenth",
    "29th": "twenty ninth",
}

# Abbreviations and initials expanded to full words / separate letters.
_ABBREVIATION_NORMS = {
    "t.h": "t h",
    "mrs": "misses",
    "etc": "etcetera",
    "etc.": "etcetera",
}

# Old-fashioned hyphenated spellings mapped to their modern single-word form.
_HYPHENATION_NORMS = {
    "to-day": "today",
    "to-day's": "today's",
    "to-morrow": "tomorrow",
}

# The single lookup table the rest of the notebook uses.
NORMS = {**_NUMBER_NORMS, **_ABBREVIATION_NORMS, **_HYPHENATION_NORMS}
# Endings that mark a possessive or contraction; a word carrying one of them keeps
# every apostrophe it contains.
_KEPT_APOS_SUFFIXES = ("'s", "s'", "'d", "'ve", "'re", "'ll", "n't")
# Whole words that keep their apostrophe although no suffix rule applies.
_KEPT_APOS_WORDS = frozenset(["i'm", "'em", "o'brien"])
# Sentence punctuation deleted outright (commas are removed earlier, before lookup).
_PUNCT_TABLE = str.maketrans("", "", ".?!:;")


def _check_apos(word):
    """Return ``word`` unchanged if its apostrophe is meaningful, else without apostrophes.

    A word is left alone when it ends in one of ``_KEPT_APOS_SUFFIXES`` or is listed
    in ``_KEPT_APOS_WORDS``; the comparison is case-sensitive and the tables are
    lower-case, so callers are expected to pass lower-cased words. Any other word has
    all of its apostrophes removed (this strips quote-like marks such as ``'tis``).
    """
    # str.endswith accepts a tuple, which replaces the former elif ladder (and its
    # duplicated, unreachable "'ve" branch) with a single test.
    if word.endswith(_KEPT_APOS_SUFFIXES) or word in _KEPT_APOS_WORDS:
        return word
    return word.replace("'", "")


def fix_apos(text):
    """Apply ``_check_apos`` to every single-space-delimited word of ``text``.

    The text is split on a literal ``" "`` and re-joined with ``" "``, so the spacing
    of the input is preserved exactly.
    """
    return " ".join(_check_apos(word) for word in text.split(" "))


def _squeeze_spaces(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    return " ".join(text.split())


def normalise(text, norms=None):
    """Turn a raw transcript sentence into upper-case words separated by single spaces.

    Steps, in order: drop one trailing full stop, lower-case, remove commas, replace
    whole tokens found in ``norms``, delete ``. ? ! : ;``, turn ``--`` and free-standing
    ``-`` into spaces, repair the split form ``to- morrow``, drop stray apostrophes via
    ``fix_apos``, turn remaining hyphens into spaces, collapse whitespace, upper-case.

    Args:
        text: The raw sentence. An empty string is accepted and yields ``""``.
        norms: Optional mapping of lower-case token -> replacement. Defaults to the
            notebook-level ``NORMS`` table.

    Returns:
        The normalised sentence as an upper-case string with no leading, trailing or
        repeated spaces.
    """
    if norms is None:
        norms = NORMS
    # endswith() rather than text[-1], so an empty transcript no longer raises IndexError.
    if text.endswith("."):
        text = text[:-1]
    text = text.lower().replace(",", "")
    # Lookup happens before the remaining punctuation is removed, so keys such as
    # "etc." or "0.75" can still match.
    text = " ".join(norms.get(word, word) for word in text.split(" "))
    text = text.translate(_PUNCT_TABLE)
    text = text.replace("--", " ")
    # A dash with spaces on both sides leaves three spaces behind; a single
    # replace() pass cannot remove them all, so collapse whitespace properly before
    # the space-sensitive replacements below.
    text = _squeeze_spaces(text)
    text = text.replace(" - ", " ")
    text = text.replace("to- morrow", "tomorrow")
    text = fix_apos(text)
    text = text.replace("-", " ")
    # Hyphen and apostrophe removal can create new gaps; squeeze once more so that
    # downstream word splitting never sees an empty word.
    return _squeeze_spaces(text).upper()


def parse_transcript_line(line):
    """Extract ``(utterance_id, raw_text)`` from one line of the transcript file.

    The id is whatever lies between the first space and the first double quote
    (stripped); the text is everything between the first and the last double quote.
    Presumably the lines are Festival-style ``( utt_id "text" )`` records -- confirm
    against the data file.

    Returns:
        A ``(utterance_id, raw_text)`` tuple, or ``None`` when the line does not
        contain a pair of double quotes (e.g. a blank line).
    """
    first_quote = line.find('"')
    last_quote = line.rfind('"')
    if first_quote == -1 or last_quote == first_quote:
        return None
    first_space = line.find(" ")
    utt_id = line[first_space + 1:first_quote].strip()
    return utt_id, line[first_quote + 1:last_quote]


def load_transcripts(path, norms=None):
    """Read the transcript file at ``path`` into ``{utterance_id: normalised text}``.

    Lines without a quoted sentence are skipped instead of aborting the load.
    """
    transcripts = {}
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(path, encoding="utf-8") as inf:
        for line in inf:
            parsed = parse_transcript_line(line)
            if parsed is None:
                continue
            utt_id, raw_text = parsed
            transcripts[utt_id] = normalise(raw_text, norms)
    return transcripts


def write_transcripts(transcripts, path="text.tsv"):
    """Write one ``utterance_id<TAB>text`` line per entry of ``transcripts`` to ``path``."""
    with open(path, "w", encoding="utf-8") as of:
        for utt_id, text in transcripts.items():
            of.write(f"{utt_id}\t{text}\n")


# Jupyter and papermill execute cells with __name__ == "__main__", so this runs in the
# notebook exactly as before; the guard only keeps the helpers above importable
# (e.g. after `nbconvert --to script`) without touching the filesystem.
if __name__ == "__main__":
    data = load_transcripts(RAWTEXT)
    write_transcripts(data)
from pathlib import Path

WAVPATH = Path("../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/wav/")
# Sampling rate (Hz) every clip must have; also used to turn frames into seconds.
SAMPLE_RATE = 16000
# Number of manifest lines that go to each of the test and dev splits.
HELDOUT_LINES = 57


def write_frames_manifest(wav_dir, out_path="frames.tsv", sample_rate=SAMPLE_RATE):
    """Write ``<name>.wav<TAB><frame count>`` for every ``*.wav`` directly in ``wav_dir``.

    Files are processed in sorted name order so the manifest, and therefore the
    positional train/test/dev split derived from it, is the same on every run.

    Args:
        wav_dir: Directory that holds the wav files.
        out_path: Manifest file to create or overwrite.
        sample_rate: Rate every file must have; a mismatch fails the assertion.

    Returns:
        The total number of frames over all files.
    """
    # Imported here so the text-only helpers in this section stay usable on machines
    # where soundfile (libsndfile) is not installed.
    import soundfile as sf

    total_frames = 0
    with open(out_path, "w", encoding="utf-8") as of:
        for wav in sorted(Path(wav_dir).glob("*.wav")):
            # sf.info reads only the header; decoding the whole clip with sf.read
            # just to take its length is unnecessary.
            info = sf.info(str(wav))
            assert info.samplerate == sample_rate, (
                f"{wav.name}: expected {sample_rate} Hz, got {info.samplerate}"
            )
            total_frames += info.frames
            of.write(f"{wav.stem}.wav\t{info.frames}\n")
    return total_frames


def split_manifest(lines, heldout=HELDOUT_LINES):
    """Split manifest lines positionally into ``(train, test, dev)``.

    The last ``2 * heldout`` lines are held out: the first ``heldout`` of them form
    the test set and the rest the dev set; everything before is training data. The
    cut is computed from ``len(lines)`` rather than a hard-coded file count. With
    fewer than ``2 * heldout`` lines, train is empty and test is filled before dev.
    """
    cut = max(len(lines) - 2 * heldout, 0)
    held = lines[cut:]
    return lines[:cut], held[:heldout], held[heldout:]


def write_splits(manifest_path="frames.tsv", heldout=HELDOUT_LINES):
    """Write ``train.tsv``, ``test.tsv`` and ``dev.tsv`` from the manifest file."""
    with open(manifest_path, encoding="utf-8") as inf:
        lines = inf.readlines()
    train, test, dev = split_manifest(lines, heldout)
    for name, subset in (("train", train), ("test", test), ("dev", dev)):
        with open(f"{name}.tsv", "w", encoding="utf-8") as of:
            of.writelines(subset)


def do_fairseq(text):
    """Convert a sentence to letter targets: ``"HI YOU"`` -> ``"H I | Y O U |"``.

    Each word's characters are separated by spaces and every word, including the
    last, is followed by `` |``. Words are split on any run of whitespace, so
    repeated or leading/trailing spaces never produce an empty word. An empty
    ``text`` yields ``" |"``.
    """
    spelled = [" ".join(word) for word in text.split()]
    return " | ".join(spelled) + " |"


def write_letter_file(part, transcripts):
    """Write ``<part>.ltr`` with one letter-target line per entry of ``<part>.tsv``.

    Only lines containing a tab are used; the first column, minus a trailing
    ``.wav``, is the key looked up in ``transcripts``.

    Raises:
        KeyError: if a manifest entry has no transcript.
    """
    suffix = ".wav"
    with open(f"{part}.ltr", "w", encoding="utf-8") as of, \
            open(f"{part}.tsv", encoding="utf-8") as inf:
        for line in inf:
            if "\t" not in line:
                continue
            name = line.strip().split("\t")[0]
            # Strip the extension only where it is a suffix, not anywhere in the name.
            utt_id = name[:-len(suffix)] if name.endswith(suffix) else name
            of.write(do_fairseq(transcripts[utt_id]) + "\n")


# Jupyter and papermill execute cells with __name__ == "__main__", so this runs in the
# notebook exactly as before; the guard only keeps the helpers above importable
# without scanning audio or writing files.
if __name__ == "__main__":
    total = write_frames_manifest(WAVPATH)
    print("Total:", total / SAMPLE_RATE)
    write_splits()
    for part in ["test", "train", "dev"]:
        # `data` is the {utterance_id: text} dict built by the transcript-loading cell.
        write_letter_file(part, data)