Create create_dataset.py

create_dataset.py (ADDED, +123 -0)
import csv
import subprocess
from pathlib import Path

import librosa
import numpy as np
import torch
import whisper
from demucs.apply import apply_model
from demucs.pretrained import get_model, DEFAULT_MODEL
from scipy.io import wavfile


def download_youtube_clip(video_identifier, start_time, end_time, output_filename,
                          num_attempts=5, url_base="https://www.youtube.com/watch?v="):
    """Download the audio of a YouTube clip to `output_filename` as wav.

    Retries up to `num_attempts` times and returns a (status, message) tuple.
    """
    status = False

    output_path = Path(output_filename)
    if output_path.exists():
        return True, "Already Downloaded"

    command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"
    """.strip()

    attempts = 0
    while True:
        try:
            output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            attempts += 1
            if attempts == num_attempts:
                return status, err.output
        else:
            break

    status = output_path.exists()
    return status, "Downloaded"


def split_long_audio(model, filepaths, character_name, save_dir="data_dir", out_sr=44100):
    """Transcribe each file with Whisper and write one wav per detected speech segment."""
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    for file_idx, filepath in enumerate(filepaths):

        save_path = Path(save_dir) / character_name
        save_path.mkdir(exist_ok=True, parents=True)

        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        # Load, trim leading/trailing silence, and peak-normalize before resampling
        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        wav2 /= max(wav2.max(), -wav2.min())

        # Slice the resampled audio at the segment boundaries Whisper found
        for i, seg in enumerate(segments):
            start_time = seg['start']
            end_time = seg['end']
            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]
            wav_seg_name = f"{character_name}_{file_idx}_{i}.wav"
            out_fpath = save_path / wav_seg_name
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))


def extract_vocal_demucs(model, filename, out_filename, sr=44100, device=None,
                         shifts=1, split=True, overlap=0.25, jobs=0):
    """Isolate the vocal stem of `filename` with Demucs and write it to `out_filename`."""
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    ref = wav.mean(0)
    # Demucs expects roughly zero-mean, unit-variance input; undo the scaling afterwards
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model,
        wav[None],
        device=device,
        shifts=shifts,
        split=split,
        overlap=overlap,
        progress=True,
        num_workers=jobs,
    )[0]
    sources = sources * ref.std() + ref.mean()

    # The last source is the vocal stem; rescale slightly to avoid clipping
    wav = sources[-1]
    wav = wav / max(1.01 * wav.abs().max(), 1)
    wavfile.write(out_filename, rate=sr, data=wav.numpy().T)
    return out_filename


def main(
    clips_csv_filepath="theovon.csv",
    character="theovon",
    do_extract_vocals=False,
    whisper_size="medium",
    # Where raw yt clips will be downloaded to
    dl_dir="raw_data",
    # Where actual data will be organized
    data_dir="prepared_data",
):
    dl_path = Path(dl_dir) / character
    dl_path.mkdir(exist_ok=True, parents=True)
    if do_extract_vocals:
        demucs_model = get_model(DEFAULT_MODEL)

    # Each CSV row must provide 'ytid', 'start', and 'end' columns
    with Path(clips_csv_filepath).open() as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            outfile_path = dl_path / f"{character}_{i:04d}.wav"
            status, log = download_youtube_clip(row['ytid'], row['start'], row['end'], outfile_path)
            if not status:
                # Skip clips that failed to download rather than crashing later
                print(f"Failed to download clip {i} ({row['ytid']}): {log}")
                continue
            if do_extract_vocals:
                extract_vocal_demucs(demucs_model, outfile_path, outfile_path)

    # Segment every downloaded file into short, transcribed utterances
    filenames = sorted([str(x) for x in dl_path.glob("*.wav")])
    whisper_model = whisper.load_model(whisper_size)
    split_long_audio(whisper_model, filenames, character, data_dir)


if __name__ == '__main__':
    import json

    cfg = json.loads(Path('dataset_config.json').read_text())
    main(**cfg)
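
For context, main() loads its keyword arguments from dataset_config.json, and the clips CSV must expose ytid, start, and end fields, since that is what the csv.DictReader loop accesses. Below is a minimal sketch that generates both input files; the JSON keys mirror main()'s keyword arguments, while the video ID and clip bounds are placeholder assumptions, not part of the commit.

    # Hypothetical helper to create the two inputs main() expects.
    import json
    from pathlib import Path

    Path("dataset_config.json").write_text(json.dumps({
        "clips_csv_filepath": "theovon.csv",
        "character": "theovon",
        "do_extract_vocals": False,
        "whisper_size": "medium",
        "dl_dir": "raw_data",
        "data_dir": "prepared_data",
    }, indent=2))

    # Header must match the keys read in main(): ytid, start, end (seconds).
    Path("theovon.csv").write_text(
        "ytid,start,end\n"
        "dQw4w9WgXcQ,10,70\n"  # placeholder video ID and clip bounds
    )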
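Assuming yt-dlp and ffmpeg are on PATH (the --download-sections flag relies on ffmpeg) and the demucs, openai-whisper, librosa, scipy, and torch packages are installed, writing dataset_config.json as above and running python create_dataset.py should execute the full pipeline: download the clips, optionally isolate vocals in place, then segment everything with Whisper into prepared_data/<character>/.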