nateraw committed on
Commit
0651587
·
1 Parent(s): afbc95c

Create create_dataset.py (#1)

Browse files

- Create create_dataset.py (8dae101189acb95b8c4ad2bdec74180d1f19a98b)
- Create dataset_config.json (03e1a271e12993d584de8596cadc804d7762fa5a)
- Create data.csv (096405ce25499426c26c9ea67f6cb55d37e71358)

Files changed (3) hide show
  1. create_dataset.py +123 -0
  2. data.csv +3 -0
  3. dataset_config.json +8 -0
create_dataset.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ from pathlib import Path
3
+ import librosa
4
+ from scipy.io import wavfile
5
+ import numpy as np
6
+ from demucs.pretrained import get_model, DEFAULT_MODEL
7
+ from demucs.apply import apply_model
8
+ import torch
9
+ import csv
10
+ import whisper
11
+
12
+
13
def download_youtube_clip(video_identifier, start_time, end_time, output_filename, num_attempts=5, url_base="https://www.youtube.com/watch?v="):
    """Download one time range of a video's audio as WAV by invoking yt-dlp.

    Args:
        video_identifier: Value appended to ``url_base`` to form the URL.
        start_time: Start of the section, interpolated into yt-dlp's
            ``--download-sections "*START-END"`` argument.
        end_time: End of the section (same argument as ``start_time``).
        output_filename: Destination passed to yt-dlp's ``-o`` option
            (``str`` or path-like).
        num_attempts: Maximum number of yt-dlp invocations before giving up.
        url_base: Prefix the video identifier is appended to.

    Returns:
        A ``(status, message)`` tuple. ``status`` is ``True`` when the output
        file exists. ``message`` is ``"Already Downloaded"``, ``"Downloaded"``,
        the captured yt-dlp output (bytes) of the last failed attempt, or a
        short description of why no file was produced.
    """
    output_path = Path(output_filename)
    if output_path.exists():
        return True, "Already Downloaded"

    # An argument list (no shell) keeps identifiers/paths coming from the CSV
    # from being interpreted as shell syntax.
    command = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "-x",
        "--audio-format", "wav",
        "-f", "bestaudio",
        "-o", str(output_filename),
        "--download-sections", f"*{start_time}-{end_time}",
        f"{url_base}{video_identifier}",
    ]

    attempts = 0
    while True:
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            attempts += 1
            # ">=" so that num_attempts <= 0 cannot loop forever.
            if attempts >= num_attempts:
                return False, err.output
        except OSError as err:
            # yt-dlp is missing or not executable; retrying cannot help.
            return False, str(err)
        else:
            break

    if not output_path.exists():
        # yt-dlp exited successfully but did not write the expected file.
        return False, "yt-dlp finished but the output file was not found"
    return True, "Downloaded"
37
+
38
+
39
def split_long_audio(model, filepaths, character_name, save_dir="data_dir", out_sr=44100):
    """Cut audio files into per-segment 16-bit WAV clips using transcription timestamps.

    Each file is transcribed with ``model.transcribe``; every entry of the
    returned ``result['segments']`` supplies a ``'start'``/``'end'`` pair (in
    seconds) that is used to slice the resampled, peak-normalised audio.
    Clips are written to ``<save_dir>/<character_name>/`` as
    ``<character_name>_<file index>_<segment index>.wav``.

    Args:
        model: Object exposing ``transcribe(path, ...)`` that returns a mapping
            with a ``'segments'`` list (a Whisper model in this file's usage).
        filepaths: One path or an iterable of paths to audio files.
        character_name: Sub-directory name and filename prefix for the clips.
        save_dir: Root directory for the output clips.
        out_sr: Sample rate of the written clips.
    """
    # A lone path (str or Path) is treated as a one-element list.
    if isinstance(filepaths, (str, Path)):
        filepaths = [filepaths]

    save_path = Path(save_dir) / character_name
    save_path.mkdir(exist_ok=True, parents=True)
    int16_max = np.iinfo(np.int16).max

    for file_idx, filepath in enumerate(filepaths):
        filepath = str(filepath)

        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        if wav.size == 0:
            continue

        # trim() drops leading/trailing quiet audio, which shifts the time
        # base. The transcription timestamps refer to the untrimmed file, so
        # remember how many seconds were removed from the front.
        wav, trim_index = librosa.effects.trim(wav, top_db=20)
        if wav.size == 0:
            # Entirely silent file: nothing to export.
            continue
        trim_offset = trim_index[0] / sr

        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        norm = np.abs(wav2).max()
        if norm > 0:  # avoid dividing by zero on all-zero audio
            wav2 = wav2 / norm

        for i, seg in enumerate(segments):
            # Map timestamps from the original file onto the trimmed signal.
            start_time = max(seg['start'] - trim_offset, 0.0)
            end_time = max(seg['end'] - trim_offset, 0.0)
            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]
            if wav_seg.size == 0:
                # Zero-length or fully trimmed-away segment; skip the empty file.
                continue
            wav_seg_name = f"{character_name}_{file_idx}_{i}.wav"
            out_fpath = save_path / wav_seg_name
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * int16_max).astype(np.int16))
67
+
68
+
69
def extract_vocal_demucs(model, filename, out_filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    """Separate the vocal stem of an audio file with a Demucs model and save it as WAV.

    Args:
        model: Separation model passed to ``apply_model``.
        filename: Input audio file, loaded with ``librosa.load`` at ``sr``.
        out_filename: Destination WAV path (may equal ``filename``).
        sr: Sample rate used for loading and for the written file.
        device, shifts, split, overlap: Forwarded unchanged to ``apply_model``.
        jobs: Forwarded to ``apply_model`` as ``num_workers``.

    Returns:
        ``out_filename``.
    """
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    if wav.dim() == 1:
        # librosa returns a 1-D array for mono files; wav[None] below must be
        # (batch, channels, samples), so duplicate the single channel.
        # NOTE(review): assumes the model exposes `audio_channels` as Demucs
        # models do; falls back to stereo otherwise.
        wav = wav.unsqueeze(0).repeat(getattr(model, "audio_channels", 2), 1)

    # Standardise with the statistics of the channel-averaged signal and undo
    # the scaling on the separated sources afterwards.
    ref = wav.mean(0)
    ref_mean = ref.mean()
    ref_std = ref.std()
    if not ref_std > 0:
        # Silent (or single-sample) input: skip scaling rather than divide by 0/NaN.
        ref_std = torch.ones_like(ref_std)
    wav = (wav - ref_mean) / ref_std
    sources = apply_model(
        model,
        wav[None],
        device=device,
        shifts=shifts,
        split=split,
        overlap=overlap,
        progress=True,
        num_workers=jobs,
    )[0]
    sources = sources * ref_std + ref_mean

    # Prefer looking the stem up by name; fall back to the last source, which
    # is what the original code assumed to be the vocals.
    source_names = list(getattr(model, "sources", []))
    vocals_idx = source_names.index("vocals") if "vocals" in source_names else -1
    wav = sources[vocals_idx]
    # Scale down only when the signal would otherwise clip.
    wav = wav / max(1.01 * wav.abs().max(), 1)
    wavfile.write(out_filename, rate=sr, data=wav.numpy().T)
    return out_filename
90
+
91
+
92
def main(
    clips_csv_filepath="theovon.csv",
    character="theovon",
    do_extract_vocals=False,
    whisper_size="medium",
    # Where raw yt clips will be downloaded to
    dl_dir="raw_data",
    # Where actual data will be organized
    data_dir="prepared_data",
):
    """Build a segmented speech dataset from the YouTube clips listed in a CSV.

    For every CSV row (columns ``ytid``, ``start``, ``end``) the clip is
    downloaded to ``<dl_dir>/<character>/<character>_<row index>.wav`` and,
    when ``do_extract_vocals`` is set, replaced in place by its vocal stem.
    All WAV files found in that directory are then split into segments under
    ``data_dir`` using a Whisper model of size ``whisper_size``.
    """
    dl_path = Path(dl_dir) / character
    dl_path.mkdir(exist_ok=True, parents=True)
    demucs_model = get_model(DEFAULT_MODEL) if do_extract_vocals else None

    # newline="" is what the csv module asks for when it is handed a file object.
    with Path(clips_csv_filepath).open(newline="") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            video_id = row['ytid']
            outfile_path = dl_path / f"{character}_{i:04d}.wav"
            downloaded, message = download_youtube_clip(video_id, row['start'], row['end'], outfile_path)
            if not downloaded:
                # No file to post-process; report it and carry on with the rest.
                print(f"Skipping clip {i} ('{video_id}'): download failed: {message}")
                continue
            if do_extract_vocals:
                extract_vocal_demucs(demucs_model, outfile_path, outfile_path)

    filenames = sorted(str(x) for x in dl_path.glob("*.wav"))
    whisper_model = whisper.load_model(whisper_size)
    split_long_audio(whisper_model, filenames, character, data_dir)
118
+
119
+
120
if __name__ == '__main__':
    # Script entry point: the keyword arguments for main() are read from a
    # JSON file located in the current working directory.
    import json

    config_text = Path('dataset_config.json').read_text()
    settings = json.loads(config_text)
    main(**settings)
data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ytid,start,end
2
+ YYiQxHM0L-w,300,660
3
+ Ga-CcToGiUM,3105,3300
dataset_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clips_csv_filepath": "data.csv",
3
+ "character": "theovon",
4
+ "do_extract_vocals": false,
5
+ "whisper_size": "medium",
6
+ "dl_dir": "downloads",
7
+ "data_dir": "dataset_raw"
8
+ }