pseeth hugggof committed on
Commit
c91e8cc
0 Parent(s):

Duplicate from hugggof/vampnet

Browse files

Co-authored-by: Hugo Flores <[email protected]>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +36 -0
  2. .gitignore +184 -0
  3. .pre-commit-config.yaml +15 -0
  4. LICENSE +21 -0
  5. README.md +95 -0
  6. app.py +511 -0
  7. assets/example.wav +0 -0
  8. conf/c2f.yml +14 -0
  9. conf/generated-v0/berta-goldman-speech/c2f.yml +15 -0
  10. conf/generated-v0/berta-goldman-speech/coarse.yml +8 -0
  11. conf/generated-v0/berta-goldman-speech/interface.yml +5 -0
  12. conf/generated-v0/gamelan-xeno-canto/c2f.yml +17 -0
  13. conf/generated-v0/gamelan-xeno-canto/coarse.yml +10 -0
  14. conf/generated-v0/gamelan-xeno-canto/interface.yml +6 -0
  15. conf/generated-v0/nasralla/c2f.yml +15 -0
  16. conf/generated-v0/nasralla/coarse.yml +8 -0
  17. conf/generated-v0/nasralla/interface.yml +5 -0
  18. conf/generated/breaks-steps/c2f.yml +15 -0
  19. conf/generated/breaks-steps/coarse.yml +8 -0
  20. conf/generated/breaks-steps/interface.yml +7 -0
  21. conf/generated/bulgarian-tv-choir/c2f.yml +15 -0
  22. conf/generated/bulgarian-tv-choir/coarse.yml +8 -0
  23. conf/generated/bulgarian-tv-choir/interface.yml +7 -0
  24. conf/generated/dariacore/c2f.yml +15 -0
  25. conf/generated/dariacore/coarse.yml +8 -0
  26. conf/generated/dariacore/interface.yml +7 -0
  27. conf/generated/musica-bolero-marimba/c2f.yml +18 -0
  28. conf/generated/musica-bolero-marimba/coarse.yml +11 -0
  29. conf/generated/musica-bolero-marimba/interface.yml +8 -0
  30. conf/generated/panchos/c2f.yml +15 -0
  31. conf/generated/panchos/coarse.yml +8 -0
  32. conf/generated/panchos/interface.yml +7 -0
  33. conf/generated/titi-monkey/c2f.yml +15 -0
  34. conf/generated/titi-monkey/coarse.yml +8 -0
  35. conf/generated/titi-monkey/interface.yml +7 -0
  36. conf/generated/xeno-canto/c2f.yml +15 -0
  37. conf/generated/xeno-canto/coarse.yml +8 -0
  38. conf/generated/xeno-canto/interface.yml +7 -0
  39. conf/interface.yml +10 -0
  40. conf/lora/birds.yml +10 -0
  41. conf/lora/birdss.yml +12 -0
  42. conf/lora/constructions.yml +10 -0
  43. conf/lora/ella-baila-sola.yml +10 -0
  44. conf/lora/gas-station.yml +10 -0
  45. conf/lora/lora-is-this-charlie-parker.yml +10 -0
  46. conf/lora/lora.yml +22 -0
  47. conf/lora/underworld.yml +10 -0
  48. conf/lora/xeno-canto/c2f.yml +21 -0
  49. conf/lora/xeno-canto/coarse.yml +10 -0
  50. conf/vampnet-musdb-drums.yml +22 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ .pth filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/env.sh
108
+ venv/
109
+ env.bak/
110
+ venv.bak/
111
+
112
+ # Spyder project settings
113
+ .spyderproject
114
+ .spyproject
115
+
116
+ # Rope project settings
117
+ .ropeproject
118
+
119
+ # mkdocs documentation
120
+ /site
121
+
122
+ # mypy
123
+ .mypy_cache/
124
+ .dmypy.json
125
+ dmypy.json
126
+
127
+ # Pyre type checker
128
+ .pyre/
129
+
130
+ # Files created by experiments
131
+ output/
132
+ snapshot/
133
+ *.m4a
134
+ notebooks/scratch.ipynb
135
+ notebooks/inspect.ipynb
136
+ notebooks/effects.ipynb
137
+ notebooks/*.ipynb
138
+ notebooks/*.gif
139
+ notebooks/*.wav
140
+ notebooks/*.mp4
141
+ *runs/
142
+ boards/
143
+ samples/
144
+ *.ipynb
145
+
146
+ results.json
147
+ metrics.csv
148
+ mprofile_*
149
+ mem.png
150
+
151
+ results/
152
+ mprofile*
153
+ *.png
154
+ # do not ignore the test wav file
155
+ !tests/audio/short_test_audio.wav
156
+ !tests/audio/output.wav
157
+ */.DS_Store
158
+ .DS_Store
159
+ env.sh
160
+ _codebraid/
161
+ **/*.html
162
+ **/*.exec.md
163
+ flagged/
164
+ log.txt
165
+ ckpt/
166
+ .syncthing*
167
+ tests/assets/
168
+ archived/
169
+
170
+ scratch/
171
+
172
+ runs-archive
173
+ lyrebird-audiotools
174
+ lyrebird-audio-codec
175
+ samples-*/**
176
+
177
+ gradio-outputs/
178
+ samples*/
179
+ models-all/
180
+ models.zip
181
+ audiotools/
182
+ descript-audio-codec/
183
+ # *.pth
184
+ .git-old
.pre-commit-config.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/asottile/reorder_python_imports
3
+ rev: v2.5.0
4
+ hooks:
5
+ - id: reorder-python-imports
6
+ - repo: https://github.com/psf/black
7
+ rev: 23.1.0
8
+ hooks:
9
+ - id: black
10
+ language_version: python3
11
+ - repo: https://github.com/pre-commit/pre-commit-hooks
12
+ rev: v4.0.1
13
+ hooks:
14
+ - id: end-of-file-fixer
15
+ - id: trailing-whitespace
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Hugo Flores García and Prem Seetharaman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: 'VampNet: Music Generation with Masked Transformers'
3
+ emoji: 🤖
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: hugggof/vampnet
11
+ ---
12
+
13
+ # VampNet
14
+
15
+ This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.
16
+
17
+ # Setting up
18
+
19
+ Requires Python 3.9 or later.
20
+
21
+
22
+ install VampNet
23
+
24
+ ```bash
25
+ git clone https://github.com/hugofloresgarcia/vampnet.git
26
+ pip install -e ./vampnet
27
+ ```
28
+
29
+ ## A note on argbind
30
+ This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files.
31
+ Config files are stored in the `conf/` folder.
32
+
33
+ ## Getting the Pretrained Models
34
+
35
+ ### Licensing for Pretrained Models:
36
+ The weights for the models are licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml). Likewise, any VampNet models fine-tuned on the pretrained models are also licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml).
37
+
38
+ Download the pretrained models from [this link](https://zenodo.org/record/8136545). Then, extract the models to the `models/` folder.
39
+
40
+
41
+ # Usage
42
+
43
+ ## Launching the Gradio Interface
44
+ You can launch a gradio UI to play with vampnet.
45
+
46
+ ```bash
47
+ python app.py --args.load conf/interface.yml --Interface.device cuda
48
+ ```
49
+
50
+ # Training / Fine-tuning
51
+
52
+ ## Training a model
53
+
54
+ To train a model, run the following script:
55
+
56
+ ```bash
57
+ python scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints
58
+ ```
59
+
60
+ You can edit `conf/vampnet.yml` to change the dataset paths or any training hyperparameters.
61
+
62
+ For coarse2fine models, you can use `conf/c2f.yml` as a starting configuration.
63
+
64
+ See `python scripts/exp/train.py -h` for a list of options.
65
+
66
+ ## Fine-tuning
67
+ To fine-tune a model, use the script in `scripts/exp/fine_tune.py` to generate 3 configuration files: `c2f.yml`, `coarse.yml`, and `interface.yml`.
68
+ `c2f.yml` and `coarse.yml` are used to fine-tune the coarse-to-fine and coarse models, respectively. `interface.yml` is used to launch the gradio interface.
69
+
70
+ ```bash
71
+ python scripts/exp/fine_tune.py "/path/to/audio1.mp3 /path/to/audio2/ /path/to/audio3.wav" <fine_tune_name>
72
+ ```
73
+
74
+ This will create a folder under `conf/generated/<fine_tune_name>/` with the 3 configuration files.
75
+
76
+ The save_paths will be set to `runs/<fine_tune_name>/coarse` and `runs/<fine_tune_name>/c2f`.
77
+
78
+ launch the coarse job:
79
+ ```bash
80
+ python scripts/exp/train.py --args.load conf/generated/<fine_tune_name>/coarse.yml
81
+ ```
82
+
83
+ this will save the coarse model to `runs/<fine_tune_name>/coarse/ckpt/best/`.
84
+
85
+ launch the c2f job:
86
+ ```bash
87
+ python scripts/exp/train.py --args.load conf/generated/<fine_tune_name>/c2f.yml
88
+ ```
89
+
90
+ launch the interface:
91
+ ```bash
92
+ python app.py --args.load conf/generated/<fine_tune_name>/interface.yml
93
+ ```
94
+
95
+
app.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Tuple
3
+ import yaml
4
+ import tempfile
5
+ import uuid
6
+ import shutil
7
+ from dataclasses import dataclass, asdict
8
+
9
+ import numpy as np
10
+ import audiotools as at
11
+ import argbind
12
+ import torch
13
+
14
+ import gradio as gr
15
+ from vampnet.interface import Interface
16
+ from vampnet import mask as pmask
17
+
18
+ # Interface = argbind.bind(Interface)
19
+ # AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
20
+
21
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    _DEVICE = "cuda"
else:
    _DEVICE = "cpu"

# Model wrapper shared by the callbacks in this file; checkpoints are read
# from the local ./models/vampnet/ folder.
interface = Interface(
    coarse_ckpt="./models/vampnet/coarse.pth",
    coarse2fine_ckpt="./models/vampnet/c2f.pth",
    codec_ckpt="./models/vampnet/codec.pth",
    device=_DEVICE,
)

# loader = AudioLoader()
print(f"interface device is {interface.device}")

# dataset = at.data.datasets.AudioDataset(
#     loader,
#     sample_rate=interface.codec.sample_rate,
#     duration=interface.coarse.chunk_size_s,
#     n_examples=5000,
#     without_replacement=True,
# )

# Directory that receives every file the demo writes; created up front so the
# callbacks can assume it exists.
OUT_DIR = Path("gradio-outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
41
+
42
+
43
def load_audio(file):
    """Cut an uploaded file down to one excerpt and store it as the input clip.

    Args:
        file: Upload object from the file widget. Only its ``name`` attribute
            is read and used as the path of the audio to load.

    Returns:
        ``path_to_file`` of the signal after it has been written to
        ``OUT_DIR/<uuid4>/input.wav`` (presumably that same path; confirm
        against audiotools).
    """
    print(file)

    # Take an excerpt no longer than the coarse model's chunk size, then run
    # the interface's own preprocessing on it.
    excerpt = at.AudioSignal.salient_excerpt(
        file.name, duration=interface.coarse.chunk_size_s
    )
    excerpt = interface.preprocess(excerpt)

    # Each upload gets its own uniquely named folder.
    upload_dir = OUT_DIR / str(uuid.uuid4())
    upload_dir.mkdir(parents=True, exist_ok=True)
    excerpt.write(upload_dir / "input.wav")
    return excerpt.path_to_file
56
+
57
+
58
def load_example_audio():
    """Return the relative path of the example clip bundled with the app."""
    example_path = "./assets/example.wav"
    return example_path
60
+
61
+
62
def _vamp(data, return_mask=False):
    """Generate a variation ("vamp") of the input audio from the widget values.

    Args:
        data: Mapping keyed by the Gradio components themselves (``input_audio``,
            ``temp``, ``periodic_p``, ...) whose values are the components'
            current values.
        return_mask: When true, also decode the masked tokens to audio and
            return that file's path.

    Returns:
        The ``path_to_file`` of the generated signal, or a
        ``(output_path, mask_path)`` tuple when ``return_mask`` is true.
    """
    # Remove any old files in the output directory (from previous runs).
    # Guard the removal so a directory that was deleted externally does not
    # crash the request.
    if OUT_DIR.exists():
        shutil.rmtree(OUT_DIR)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    out_dir = OUT_DIR / str(uuid.uuid4())
    out_dir.mkdir()
    sig = at.AudioSignal(data[input_audio])

    z = interface.encode(sig)

    ncc = data[n_conditioning_codebooks]

    # build the mask
    mask = pmask.linear_random(z, data[rand_mask_intensity])
    mask = pmask.mask_and(
        mask,
        pmask.inpaint(
            z,
            interface.s2t(data[prefix_s]),
            interface.s2t(data[suffix_s]),
        ),
    )
    mask = pmask.mask_and(
        mask,
        pmask.periodic_mask(
            z,
            data[periodic_p],
            data[periodic_w],
            random_roll=True,
        ),
    )
    if data[onset_mask_width] > 0:
        mask = pmask.mask_or(
            mask,
            pmask.onset_mask(sig, z, interface, width=data[onset_mask_width]),
        )
    if data[beat_mask_width] > 0:
        # The slider value is in milliseconds; it is divided by 1000 before
        # being passed as ``after_beat_s``.
        beat_mask = interface.make_beat_mask(
            sig,
            after_beat_s=(data[beat_mask_width] / 1000),
            mask_upbeats=not data[beat_mask_downbeats],
        )
        mask = pmask.mask_and(mask, beat_mask)

    # these should be the last two mask ops
    mask = pmask.dropout(mask, data[dropout])
    mask = pmask.codebook_unmask(mask, ncc)

    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[temp]}, use coarse2fine {data[use_coarse2fine]}")
    # save the mask as a txt file
    np.savetxt(out_dir / "mask.txt", mask[:, 0, :].long().cpu().numpy())

    # NOTE(review): the coarse pass receives the slider temperature scaled by
    # 10 while the coarse-to-fine pass receives it unscaled; the reason is not
    # visible here -- confirm it is intentional.
    zv, mask_z = interface.coarse_vamp(
        z,
        mask=mask,
        sampling_steps=data[num_steps],
        temperature=data[temp] * 10,
        return_mask=True,
        typical_filtering=data[typical_filtering],
        typical_mass=data[typical_mass],
        typical_min_tokens=data[typical_min_tokens],
        gen_fn=interface.coarse.generate,
    )

    # Read the checkbox VALUE from ``data``. Testing the ``use_coarse2fine``
    # component object itself is always truthy, which made the checkbox a
    # no-op.
    if data[use_coarse2fine]:
        zv = interface.coarse_to_fine(zv, temperature=data[temp])

    sig = interface.to_signal(zv).cpu()
    print("done")

    sig.write(out_dir / "output.wav")

    if return_mask:
        mask = interface.to_signal(mask_z).cpu()
        mask.write(out_dir / "mask.wav")
        return sig.path_to_file, mask.path_to_file
    else:
        return sig.path_to_file
141
+
142
def vamp(data):
    """Callback for the UI button: generate audio and also return the mask audio path."""
    want_mask = True
    return _vamp(data, return_mask=want_mask)
144
+
145
def api_vamp(data):
    """Callback for the API endpoint: generate audio and return only its path."""
    want_mask = False
    return _vamp(data, return_mask=want_mask)
147
+
148
def save_vamp(data):
    """Save the current input/output pair and its settings, then zip them.

    Writes ``input.wav``, ``output.wav`` and ``data.yaml`` into
    ``OUT_DIR/saved/<uuid4>/`` and packs those files into a sibling
    ``<uuid4>.zip``.

    Args:
        data: Mapping keyed by the Gradio components (the generation inputs
            plus ``notes_text`` and ``output_audio``) whose values are the
            components' current values.

    Returns:
        A ``(message, zip_path)`` tuple: a confirmation string containing the
        save code (the folder name) and the path of the zip archive.
    """
    import zipfile  # kept local: not among the file's top-level imports

    out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
    out_dir.mkdir(parents=True, exist_ok=True)

    sig_in = at.AudioSignal(data[input_audio])
    sig_out = at.AudioSignal(data[output_audio])

    sig_in.write(out_dir / "input.wav")
    sig_out.write(out_dir / "output.wav")

    _data = {
        "temp": data[temp],
        "prefix_s": data[prefix_s],
        "suffix_s": data[suffix_s],
        "rand_mask_intensity": data[rand_mask_intensity],
        "num_steps": data[num_steps],
        "notes": data[notes_text],
        "periodic_period": data[periodic_p],
        "periodic_width": data[periodic_w],
        "n_conditioning_codebooks": data[n_conditioning_codebooks],
        "use_coarse2fine": data[use_coarse2fine],
        "stretch_factor": data[stretch_factor],
        # The remaining generation settings were previously dropped, which
        # made a saved vamp impossible to reproduce from data.yaml.
        "dropout": data[dropout],
        "onset_mask_width": data[onset_mask_width],
        "beat_mask_width": data[beat_mask_width],
        "beat_mask_downbeats": data[beat_mask_downbeats],
        "typical_filtering": data[typical_filtering],
        "typical_mass": data[typical_mass],
        "typical_min_tokens": data[typical_min_tokens],
    }

    # save with yaml
    with open(out_dir / "data.yaml", "w") as f:
        yaml.dump(_data, f)

    # The archive sits next to the folder and stores the files flat, under
    # their bare names.
    zip_path = out_dir.with_suffix(".zip")
    with zipfile.ZipFile(zip_path, "w") as zf:
        for file in out_dir.iterdir():
            zf.write(file, file.name)

    return f"saved! your save code is {out_dir.stem}", zip_path
183
+
184
+
185
+ with gr.Blocks() as demo:
186
+
187
+ with gr.Row():
188
+ with gr.Column():
189
+ gr.Markdown("# VampNet Audio Vamping")
190
+ gr.Markdown("""## Description:
191
+ This is a demo of the VampNet, a generative audio model that transforms the input audio based on the chosen settings.
192
+ You can control the extent and nature of variation with a set of manual controls and presets.
193
+ Use this interface to experiment with different mask settings and explore the audio outputs.
194
+ """)
195
+
196
+ gr.Markdown("""
197
+ ## Instructions:
198
+ 1. You can start by uploading some audio, or by loading the example audio.
199
+ 2. Choose a preset for the vamp operation, or manually adjust the controls to customize the mask settings.
200
+ 3. Click the "generate (vamp)!!!" button to apply the vamp operation. Listen to the output audio.
201
+ 4. Optionally, you can add some notes and save the result.
202
+ 5. You can also use the output as the new input and continue experimenting!
203
+ """)
204
+ with gr.Row():
205
+ with gr.Column():
206
+
207
+
208
+ manual_audio_upload = gr.File(
209
+ label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
210
+ file_types=["audio"]
211
+ )
212
+ load_example_audio_button = gr.Button("or load example audio")
213
+
214
+ input_audio = gr.Audio(
215
+ label="input audio",
216
+ interactive=False,
217
+ type="filepath",
218
+ )
219
+
220
+ audio_mask = gr.Audio(
221
+ label="audio mask (listen to this to hear the mask hints)",
222
+ interactive=False,
223
+ type="filepath",
224
+ )
225
+
226
+ # connect widgets
227
+ load_example_audio_button.click(
228
+ fn=load_example_audio,
229
+ inputs=[],
230
+ outputs=[ input_audio]
231
+ )
232
+
233
+ manual_audio_upload.change(
234
+ fn=load_audio,
235
+ inputs=[manual_audio_upload],
236
+ outputs=[ input_audio]
237
+ )
238
+
239
+ # mask settings
240
+ with gr.Column():
241
+
242
+
243
+ presets = {
244
+ "unconditional": {
245
+ "periodic_p": 0,
246
+ "onset_mask_width": 0,
247
+ "beat_mask_width": 0,
248
+ "beat_mask_downbeats": False,
249
+ },
250
+ "slight periodic variation": {
251
+ "periodic_p": 5,
252
+ "onset_mask_width": 5,
253
+ "beat_mask_width": 0,
254
+ "beat_mask_downbeats": False,
255
+ },
256
+ "moderate periodic variation": {
257
+ "periodic_p": 13,
258
+ "onset_mask_width": 5,
259
+ "beat_mask_width": 0,
260
+ "beat_mask_downbeats": False,
261
+ },
262
+ "strong periodic variation": {
263
+ "periodic_p": 17,
264
+ "onset_mask_width": 5,
265
+ "beat_mask_width": 0,
266
+ "beat_mask_downbeats": False,
267
+ },
268
+ "very strong periodic variation": {
269
+ "periodic_p": 21,
270
+ "onset_mask_width": 5,
271
+ "beat_mask_width": 0,
272
+ "beat_mask_downbeats": False,
273
+ },
274
+ "beat-driven variation": {
275
+ "periodic_p": 0,
276
+ "onset_mask_width": 0,
277
+ "beat_mask_width": 50,
278
+ "beat_mask_downbeats": False,
279
+ },
280
+ "beat-driven variation (downbeats only)": {
281
+ "periodic_p": 0,
282
+ "onset_mask_width": 0,
283
+ "beat_mask_width": 50,
284
+ "beat_mask_downbeats": True,
285
+ },
286
+ "beat-driven variation (downbeats only, strong)": {
287
+ "periodic_p": 0,
288
+ "onset_mask_width": 0,
289
+ "beat_mask_width": 20,
290
+ "beat_mask_downbeats": True,
291
+ },
292
+ }
293
+
294
+ preset = gr.Dropdown(
295
+ label="preset",
296
+ choices=list(presets.keys()),
297
+ value="strong periodic variation",
298
+ )
299
+ load_preset_button = gr.Button("load_preset")
300
+
301
+ with gr.Accordion("manual controls", open=True):
302
+ periodic_p = gr.Slider(
303
+ label="periodic prompt (0 - unconditional, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
304
+ minimum=0,
305
+ maximum=128,
306
+ step=1,
307
+ value=3,
308
+ )
309
+
310
+
311
+ onset_mask_width = gr.Slider(
312
+ label="onset mask width (multiplies with the periodic mask, 1 step ~= 10milliseconds) ",
313
+ minimum=0,
314
+ maximum=20,
315
+ step=1,
316
+ value=5,
317
+ )
318
+
319
+ beat_mask_width = gr.Slider(
320
+ label="beat mask width (in milliseconds)",
321
+ minimum=0,
322
+ maximum=200,
323
+ value=0,
324
+ )
325
+ beat_mask_downbeats = gr.Checkbox(
326
+ label="beat mask downbeats only?",
327
+ value=False
328
+ )
329
+
330
+
331
+ with gr.Accordion("extras ", open=False):
332
+ rand_mask_intensity = gr.Slider(
333
+ label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
334
+ minimum=0.0,
335
+ maximum=1.0,
336
+ value=1.0
337
+ )
338
+
339
+ periodic_w = gr.Slider(
340
+ label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
341
+ minimum=1,
342
+ maximum=20,
343
+ step=1,
344
+ value=1,
345
+ )
346
+ n_conditioning_codebooks = gr.Number(
347
+ label="number of conditioning codebooks. probably 0",
348
+ value=0,
349
+ precision=0,
350
+ )
351
+
352
+ stretch_factor = gr.Slider(
353
+ label="time stretch factor",
354
+ minimum=0,
355
+ maximum=64,
356
+ step=1,
357
+ value=1,
358
+ )
359
+
360
+ preset_outputs = {
361
+ periodic_p,
362
+ onset_mask_width,
363
+ beat_mask_width,
364
+ beat_mask_downbeats,
365
+ }
366
+
367
+ def load_preset(_preset):
368
+ return tuple(presets[_preset].values())
369
+
370
+ load_preset_button.click(
371
+ fn=load_preset,
372
+ inputs=[preset],
373
+ outputs=preset_outputs
374
+ )
375
+
376
+
377
+ with gr.Accordion("prefix/suffix prompts", open=False):
378
+ prefix_s = gr.Slider(
379
+ label="prefix hint length (seconds)",
380
+ minimum=0.0,
381
+ maximum=10.0,
382
+ value=0.0
383
+ )
384
+ suffix_s = gr.Slider(
385
+ label="suffix hint length (seconds)",
386
+ minimum=0.0,
387
+ maximum=10.0,
388
+ value=0.0
389
+ )
390
+
391
+ temp = gr.Slider(
392
+ label="temperature",
393
+ minimum=0.0,
394
+ maximum=10.0,
395
+ value=0.8
396
+ )
397
+
398
+
399
+
400
+ with gr.Accordion("sampling settings", open=False):
401
+ typical_filtering = gr.Checkbox(
402
+ label="typical filtering ",
403
+ value=False
404
+ )
405
+ typical_mass = gr.Slider(
406
+ label="typical mass (should probably stay between 0.1 and 0.5)",
407
+ minimum=0.01,
408
+ maximum=0.99,
409
+ value=0.15
410
+ )
411
+ typical_min_tokens = gr.Slider(
412
+ label="typical min tokens (should probably stay between 1 and 256)",
413
+ minimum=1,
414
+ maximum=256,
415
+ step=1,
416
+ value=64
417
+ )
418
+
419
+ use_coarse2fine = gr.Checkbox(
420
+ label="use coarse2fine",
421
+ value=True
422
+ )
423
+
424
+ num_steps = gr.Slider(
425
+ label="number of steps (should normally be between 12 and 36)",
426
+ minimum=1,
427
+ maximum=128,
428
+ step=1,
429
+ value=36
430
+ )
431
+
432
+ dropout = gr.Slider(
433
+ label="mask dropout",
434
+ minimum=0.0,
435
+ maximum=1.0,
436
+ step=0.01,
437
+ value=0.0
438
+ )
439
+
440
+
441
+ # mask settings
442
+ with gr.Column():
443
+ vamp_button = gr.Button("generate (vamp)!!!")
444
+ output_audio = gr.Audio(
445
+ label="output audio",
446
+ interactive=False,
447
+ type="filepath"
448
+ )
449
+
450
+ notes_text = gr.Textbox(
451
+ label="type any notes about the generated audio here",
452
+ value="",
453
+ interactive=True
454
+ )
455
+ save_button = gr.Button("save vamp")
456
+ download_file = gr.File(
457
+ label="vamp to download will appear here",
458
+ interactive=False
459
+ )
460
+ use_as_input_button = gr.Button("use output as input")
461
+
462
+ thank_you = gr.Markdown("")
463
+
464
+
465
+ _inputs = {
466
+ input_audio,
467
+ num_steps,
468
+ temp,
469
+ prefix_s, suffix_s,
470
+ rand_mask_intensity,
471
+ periodic_p, periodic_w,
472
+ n_conditioning_codebooks,
473
+ dropout,
474
+ use_coarse2fine,
475
+ stretch_factor,
476
+ onset_mask_width,
477
+ typical_filtering,
478
+ typical_mass,
479
+ typical_min_tokens,
480
+ beat_mask_width,
481
+ beat_mask_downbeats
482
+ }
483
+
484
+ # connect widgets
485
+ vamp_button.click(
486
+ fn=vamp,
487
+ inputs=_inputs,
488
+ outputs=[output_audio, audio_mask],
489
+ )
490
+
491
+ api_vamp_button = gr.Button("api vamp", visible=False)
492
+ api_vamp_button.click(
493
+ fn=api_vamp,
494
+ inputs=_inputs,
495
+ outputs=[output_audio],
496
+ api_name="vamp"
497
+ )
498
+
499
+ use_as_input_button.click(
500
+ fn=lambda x: x,
501
+ inputs=[output_audio],
502
+ outputs=[input_audio]
503
+ )
504
+
505
+ save_button.click(
506
+ fn=save_vamp,
507
+ inputs=_inputs | {notes_text, output_audio},
508
+ outputs=[thank_you, download_file]
509
+ )
510
+
511
+ demo.queue().launch()
assets/example.wav ADDED
Binary file (883 kB). View file
 
conf/c2f.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/vampnet.yml
3
+
4
+ VampNet.n_codebooks: 14
5
+ VampNet.n_conditioning_codebooks: 4
6
+
7
+ VampNet.embedding_dim: 1280
8
+ VampNet.n_layers: 16
9
+ VampNet.n_heads: 20
10
+
11
+ AudioDataset.duration: 3.0
12
+
13
+
14
+ AudioDataset.loudness_cutoff: -40.0
conf/generated-v0/berta-goldman-speech/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ save_path: ./runs/berta-goldman-speech/c2f
12
+ train/AudioLoader.sources:
13
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
14
+ val/AudioLoader.sources:
15
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
conf/generated-v0/berta-goldman-speech/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ save_path: ./runs/berta-goldman-speech/coarse
5
+ train/AudioLoader.sources:
6
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
7
+ val/AudioLoader.sources:
8
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
conf/generated-v0/berta-goldman-speech/interface.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
3
+ Interface.coarse2fine_ckpt: ./runs/berta-goldman-speech/c2f/best/vampnet/weights.pth
4
+ Interface.coarse_ckpt: ./runs/berta-goldman-speech/coarse/best/vampnet/weights.pth
5
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated-v0/gamelan-xeno-canto/c2f.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ save_path: ./runs/gamelan-xeno-canto/c2f
12
+ train/AudioLoader.sources:
13
+ - /media/CHONK/hugo/loras/Sound Tracker - Gamelan (Indonesia) [UEWCCSuHsuQ].mp3
14
+ - /media/CHONK/hugo/loras/xeno-canto-2
15
+ val/AudioLoader.sources:
16
+ - /media/CHONK/hugo/loras/Sound Tracker - Gamelan (Indonesia) [UEWCCSuHsuQ].mp3
17
+ - /media/CHONK/hugo/loras/xeno-canto-2
conf/generated-v0/gamelan-xeno-canto/coarse.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ save_path: ./runs/gamelan-xeno-canto/coarse
5
+ train/AudioLoader.sources:
6
+ - /media/CHONK/hugo/loras/Sound Tracker - Gamelan (Indonesia) [UEWCCSuHsuQ].mp3
7
+ - /media/CHONK/hugo/loras/xeno-canto-2
8
+ val/AudioLoader.sources:
9
+ - /media/CHONK/hugo/loras/Sound Tracker - Gamelan (Indonesia) [UEWCCSuHsuQ].mp3
10
+ - /media/CHONK/hugo/loras/xeno-canto-2
conf/generated-v0/gamelan-xeno-canto/interface.yml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/loras/Sound Tracker - Gamelan (Indonesia) [UEWCCSuHsuQ].mp3
3
+ - /media/CHONK/hugo/loras/xeno-canto-2
4
+ Interface.coarse2fine_ckpt: ./runs/gamelan-xeno-canto/c2f/best/vampnet/weights.pth
5
+ Interface.coarse_ckpt: ./runs/gamelan-xeno-canto/coarse/best/vampnet/weights.pth
6
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated-v0/nasralla/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ save_path: ./runs/nasralla/c2f
12
+ train/AudioLoader.sources:
13
+ - /media/CHONK/hugo/nasralla
14
+ val/AudioLoader.sources:
15
+ - /media/CHONK/hugo/nasralla
conf/generated-v0/nasralla/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ save_path: ./runs/nasralla/coarse
5
+ train/AudioLoader.sources:
6
+ - /media/CHONK/hugo/nasralla
7
+ val/AudioLoader.sources:
8
+ - /media/CHONK/hugo/nasralla
conf/generated-v0/nasralla/interface.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/nasralla
3
+ Interface.coarse2fine_ckpt: ./runs/nasralla/c2f/best/vampnet/weights.pth
4
+ Interface.coarse_ckpt: ./runs/nasralla/coarse/best/vampnet/weights.pth
5
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/breaks-steps/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/breaks-steps/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/breaks-steps
15
+ val/AudioLoader.sources: *id001
conf/generated/breaks-steps/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/breaks-steps/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/breaks-steps
8
+ val/AudioLoader.sources: *id001
conf/generated/breaks-steps/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/breaks-steps
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/breaks-steps/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/breaks-steps/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/bulgarian-tv-choir/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/bulgarian-tv-choir/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
15
+ val/AudioLoader.sources: *id001
conf/generated/bulgarian-tv-choir/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/bulgarian-tv-choir/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
8
+ val/AudioLoader.sources: *id001
conf/generated/bulgarian-tv-choir/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/bulgarian-tv-choir/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/bulgarian-tv-choir/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/dariacore/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/dariacore/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/loras/dariacore
15
+ val/AudioLoader.sources: *id001
conf/generated/dariacore/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/dariacore/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/loras/dariacore
8
+ val/AudioLoader.sources: *id001
conf/generated/dariacore/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/loras/dariacore
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/dariacore/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/dariacore/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/musica-bolero-marimba/c2f.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/musica-bolero-marimba/c2f
13
+ train/AudioLoader.sources:
14
+ - /media/CHONK/hugo/loras/boleros
15
+ - /media/CHONK/hugo/loras/marimba-honduras
16
+ val/AudioLoader.sources:
17
+ - /media/CHONK/hugo/loras/boleros
18
+ - /media/CHONK/hugo/loras/marimba-honduras
conf/generated/musica-bolero-marimba/coarse.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/musica-bolero-marimba/coarse
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/loras/boleros
8
+ - /media/CHONK/hugo/loras/marimba-honduras
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/loras/boleros
11
+ - /media/CHONK/hugo/loras/marimba-honduras
conf/generated/musica-bolero-marimba/interface.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/loras/boleros
3
+ - /media/CHONK/hugo/loras/marimba-honduras
4
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
5
+ Interface.coarse2fine_lora_ckpt: ./runs/musica-bolero-marimba/c2f/latest/lora.pth
6
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
7
+ Interface.coarse_lora_ckpt: ./runs/musica-bolero-marimba/coarse/latest/lora.pth
8
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/panchos/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/panchos/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/loras/panchos/
15
+ val/AudioLoader.sources: *id001
conf/generated/panchos/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/panchos/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/loras/panchos/
8
+ val/AudioLoader.sources: *id001
conf/generated/panchos/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/loras/panchos/
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/panchos/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/panchos/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/titi-monkey/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/titi-monkey/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/loras/titi-monkey.mp3
15
+ val/AudioLoader.sources: *id001
conf/generated/titi-monkey/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/titi-monkey/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/loras/titi-monkey.mp3
8
+ val/AudioLoader.sources: *id001
conf/generated/titi-monkey/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/loras/titi-monkey.mp3
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/titi-monkey/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/titi-monkey/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/xeno-canto/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ fine_tune_checkpoint: ./models/spotdl/c2f.pth
12
+ save_path: ./runs/xeno-canto/c2f
13
+ train/AudioLoader.sources: &id001
14
+ - /media/CHONK/hugo/loras/xeno-canto-2/
15
+ val/AudioLoader.sources: *id001
conf/generated/xeno-canto/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ fine_tune_checkpoint: ./models/spotdl/coarse.pth
5
+ save_path: ./runs/xeno-canto/coarse
6
+ train/AudioLoader.sources: &id001
7
+ - /media/CHONK/hugo/loras/xeno-canto-2/
8
+ val/AudioLoader.sources: *id001
conf/generated/xeno-canto/interface.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - - /media/CHONK/hugo/loras/xeno-canto-2/
3
+ Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
4
+ Interface.coarse2fine_lora_ckpt: ./runs/xeno-canto/c2f/latest/lora.pth
5
+ Interface.coarse_ckpt: ./models/spotdl/coarse.pth
6
+ Interface.coarse_lora_ckpt: ./runs/xeno-canto/coarse/latest/lora.pth
7
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/interface.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Interface.coarse_ckpt: ./models/vampnet/coarse.pth
2
+ Interface.coarse2fine_ckpt: ./models/vampnet/c2f.pth
3
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
4
+ Interface.coarse_chunk_size_s: 10
5
+ Interface.coarse2fine_chunk_size_s: 3
6
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
7
+
8
+ # AudioLoader.sources:
9
+ # - /media/CHONK/null
10
+
conf/lora/birds.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/birds
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/birds
conf/lora/birdss.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/birds
8
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
9
+
10
+ val/AudioLoader.sources:
11
+ - /media/CHONK/hugo/spotdl/subsets/birds
12
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
conf/lora/constructions.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
conf/lora/ella-baila-sola.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/ella-baila-sola.mp3
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/ella-baila-sola.mp3
conf/lora/gas-station.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/gas-station-sushi.mp3
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/gas-station-sushi.mp3
conf/lora/lora-is-this-charlie-parker.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
conf/lora/lora.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/vampnet.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioDataset.n_examples: 10000000
7
+
8
+ val/AudioDataset.n_examples: 10
9
+
10
+
11
+ NoamScheduler.warmup: 500
12
+
13
+ batch_size: 7
14
+ num_workers: 7
15
+ epoch_length: 100
16
+ save_audio_epochs: 10
17
+
18
+ AdamW.lr: 0.0001
19
+
20
+ # lets us organize sound classes into folders and choose from those sound classes uniformly
21
+ AudioDataset.without_replacement: False
22
+ max_epochs: 500
conf/lora/underworld.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
conf/lora/xeno-canto/c2f.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/xeno-canto-2
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/xeno-canto-2
11
+
12
+
13
+ VampNet.n_codebooks: 14
14
+ VampNet.n_conditioning_codebooks: 4
15
+
16
+ VampNet.embedding_dim: 1280
17
+ VampNet.n_layers: 16
18
+ VampNet.n_heads: 20
19
+
20
+ AudioDataset.duration: 3.0
21
+ AudioDataset.loudness_cutoff: -40.0
conf/lora/xeno-canto/coarse.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+
4
+ fine_tune: True
5
+
6
+ train/AudioLoader.sources:
7
+ - /media/CHONK/hugo/xeno-canto-2
8
+
9
+ val/AudioLoader.sources:
10
+ - /media/CHONK/hugo/xeno-canto-2
conf/vampnet-musdb-drums.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/vampnet.yml
3
+
4
+ VampNet.embedding_dim: 512
5
+ VampNet.n_layers: 12
6
+ VampNet.n_heads: 8
7
+
8
+ AudioDataset.duration: 12.0
9
+
10
+ train/AudioDataset.n_examples: 10000000
11
+ train/AudioLoader.sources:
12
+ - /data/musdb18hq/train/**/*drums.wav
13
+
14
+
15
+ val/AudioDataset.n_examples: 500
16
+ val/AudioLoader.sources:
17
+ - /data/musdb18hq/test/**/*drums.wav
18
+
19
+
20
+ test/AudioDataset.n_examples: 1000
21
+ test/AudioLoader.sources:
22
+ - /data/musdb18hq/test/**/*drums.wav