trttung1610 commited on
Commit
26246bd
1 Parent(s): 42fc3a0

Upload 233 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitignore +63 -0
  2. CHANGELOG.md +28 -0
  3. CODE_OF_CONDUCT.md +80 -0
  4. CONTRIBUTING.md +35 -0
  5. Dockerfile +26 -0
  6. LICENSE +21 -0
  7. LICENSE_weights +399 -0
  8. MANIFEST.in +9 -0
  9. Makefile +40 -0
  10. README.md +7 -6
  11. app.py +463 -0
  12. app_v2.py +1839 -0
  13. assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 +0 -0
  14. assets/bach.mp3 +0 -0
  15. assets/bolero_ravel.mp3 +0 -0
  16. assets/sirens_and_a_humming_engine_approach_and_pass.mp3 +0 -0
  17. audiocraft/__init__.py +26 -0
  18. audiocraft/adversarial/__init__.py +22 -0
  19. audiocraft/adversarial/discriminators/__init__.py +10 -0
  20. audiocraft/adversarial/discriminators/base.py +34 -0
  21. audiocraft/adversarial/discriminators/mpd.py +106 -0
  22. audiocraft/adversarial/discriminators/msd.py +126 -0
  23. audiocraft/adversarial/discriminators/msstftd.py +134 -0
  24. audiocraft/adversarial/losses.py +228 -0
  25. audiocraft/data/__init__.py +10 -0
  26. audiocraft/data/audio.py +216 -0
  27. audiocraft/data/audio_dataset.py +587 -0
  28. audiocraft/data/audio_utils.py +177 -0
  29. audiocraft/data/info_audio_dataset.py +110 -0
  30. audiocraft/data/music_dataset.py +270 -0
  31. audiocraft/data/sound_dataset.py +330 -0
  32. audiocraft/data/zip.py +76 -0
  33. audiocraft/environment.py +176 -0
  34. audiocraft/grids/__init__.py +6 -0
  35. audiocraft/grids/_base_explorers.py +80 -0
  36. audiocraft/grids/audiogen/__init__.py +6 -0
  37. audiocraft/grids/audiogen/audiogen_base_16khz.py +23 -0
  38. audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py +68 -0
  39. audiocraft/grids/compression/__init__.py +6 -0
  40. audiocraft/grids/compression/_explorers.py +55 -0
  41. audiocraft/grids/compression/debug.py +31 -0
  42. audiocraft/grids/compression/encodec_audiogen_16khz.py +29 -0
  43. audiocraft/grids/compression/encodec_base_24khz.py +28 -0
  44. audiocraft/grids/compression/encodec_musicgen_32khz.py +34 -0
  45. audiocraft/grids/diffusion/4_bands_base_32khz.py +27 -0
  46. audiocraft/grids/diffusion/__init__.py +6 -0
  47. audiocraft/grids/diffusion/_explorers.py +66 -0
  48. audiocraft/grids/musicgen/__init__.py +6 -0
  49. audiocraft/grids/musicgen/_explorers.py +93 -0
  50. audiocraft/grids/musicgen/musicgen_base_32khz.py +43 -0
.gitignore ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # macOS dir files
10
+ .DS_Store
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ env/
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ .ipynb_checkpoints
31
+
32
+ # Tests and linter
33
+ .pytest_cache/
34
+ .mypy_cache/
35
+ .coverage
36
+
37
+ # docs
38
+ /api_docs
39
+
40
+ # dotenv
41
+ .env
42
+ .envrc
43
+
44
+ # virtualenv
45
+ .venv
46
+ venv/
47
+ ENV/
48
+
49
+ # egs with manifest files
50
+ egs/*
51
+ !egs/example
52
+ # local datasets
53
+ dataset/*
54
+ !dataset/example
55
+
56
+ # personal notebooks & scripts
57
+ */local_scripts
58
+ */notes
59
+ .vscode/
60
+ /notebooks
61
+ /local_scripts
62
+ /notes
63
+ /cache
CHANGELOG.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
+
7
+ ## [1.0.0] - 2023-08-02
8
+
9
+ Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
10
+ Added pretrained model for AudioGen and MultiBandDiffusion.
11
+
12
+ ## [0.0.2] - 2023-08-01
13
+
14
+ Improved demo, fixed top-p sampling (thanks @jnordberg).
15
+
16
+ Compressor tanh on output to avoid clipping with some styles (especially piano).
17
+ Now repeating the conditioning periodically if it is too short.
18
+
19
+ More options when launching Gradio app locally (thanks @ashleykleynhans).
20
+
21
+ Testing out PyTorch 2.0 memory efficient attention.
22
+
23
+ Added extended generation (infinite length) by slowly moving the windows.
24
+ Note that other implementations exist: https://github.com/camenduru/MusicGen-colab.
25
+
26
+ ## [0.0.1] - 2023-06-09
27
+
28
+ Initial release, with model evaluation only.
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to AudioCraft
2
+
3
+ We want to make contributing to this project as easy and transparent as
4
+ possible.
5
+
6
+ ## Pull Requests
7
+
8
+ AudioCraft is the implementation of a research paper.
9
+ Therefore, we do not plan on accepting many pull requests for new features.
10
+ We certainly welcome them for bug fixes.
11
+
12
+ 1. Fork the repo and create your branch from `main`.
13
+ 2. If you've added code that should be tested, add tests.
14
+ 3. If you've changed APIs, update the documentation.
15
+ 4. Ensure the test suite passes.
16
+ 5. Make sure your code lints.
17
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
18
+
19
+ ## Contributor License Agreement ("CLA")
20
+ In order to accept your pull request, we need you to submit a CLA. You only need
21
+ to do this once to work on any of Meta's open source projects.
22
+
23
+ Complete your CLA here: <https://code.facebook.com/cla>
24
+
25
+ ## Issues
26
+ We use GitHub issues to track public bugs. Please ensure your description is
27
+ clear and has sufficient instructions to be able to reproduce the issue.
28
+
29
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
30
+ disclosure of security bugs. In those cases, please go through the process
31
+ outlined on that page and do not file a public issue.
32
+
33
+ ## License
34
+ By contributing to AudioCraft, you agree that your contributions will be licensed
35
+ under the LICENSE file in the root directory of this source tree.
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:11.8.0-base-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive \
4
+ PYTHONUNBUFFERED=1 \
5
+ PYTHONIOENCODING=UTF-8
6
+ RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt apt update &&\
7
+ apt install -y \
8
+ wget \
9
+ git \
10
+ pkg-config \
11
+ python3 \
12
+ python3-pip \
13
+ python-is-python3 \
14
+ ffmpeg \
15
+ libnvrtc11.2 \
16
+ libtcmalloc-minimal4
17
+
18
+ RUN useradd -m -u 1000 ac
19
+ RUN --mount=type=cache,target=/root/.cache python -m pip install --upgrade pip wheel
20
+ ENV TORCH_COMMAND="pip install torch==2.0.1+cu118 torchaudio --extra-index-url https://download.pytorch.org/whl/cu118"
21
+ RUN --mount=type=cache,target=/root/.cache python -m $TORCH_COMMAND
22
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libnvrtc.so.11.2 /usr/lib/x86_64-linux-gnu/libnvrtc.so
23
+ USER 1000
24
+ RUN mkdir ~/.cache
25
+ RUN --mount=type=cache,target=/home/ac/.cache --mount=source=.,target=/home/ac/audiocraft python -m pip install -r /home/ac/audiocraft/requirements.txt
26
+ WORKDIR /home/ac/audiocraft
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
LICENSE_weights ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
MANIFEST.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ include Makefile
2
+ include LICENSE
3
+ include LICENSE_weights
4
+ include *.md
5
+ include *.ini
6
+ include requirements.txt
7
+ include audiocraft/py.typed
8
+ include assets/*.mp3
9
+ recursive-include conf *.yaml
Makefile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
INTEG=AUDIOCRAFT_DORA_DIR="/tmp/magma_$(USER)" python3 -m dora -v run --clear device=cpu dataset.num_workers=0 optim.epochs=1 \
	dataset.train.num_samples=10 dataset.valid.num_samples=10 \
	dataset.evaluate.num_samples=10 dataset.generate.num_samples=2 sample_rate=16000 \
	logging.level=DEBUG
INTEG_COMPRESSION = $(INTEG) solver=compression/debug rvq.n_q=2 rvq.bins=48 checkpoint.save_last=true # SIG is 616d7b3c
INTEG_MUSICGEN = $(INTEG) solver=musicgen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 616d7b3c
INTEG_AUDIOGEN = $(INTEG) solver=audiogen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 616d7b3c
INTEG_MBD = $(INTEG) solver=diffusion/debug dset=audio/example \
	checkpoint.save_last=false # Using compression model from 616d7b3c

default: linter tests

install:
	pip install -U pip
	pip install -U -e '.[dev]'

linter:
	flake8 audiocraft && mypy audiocraft
	flake8 tests && mypy tests

tests:
	coverage run -m pytest tests
	coverage report

tests_integ:
	$(INTEG_COMPRESSION)
	$(INTEG_MBD)
	$(INTEG_MUSICGEN)
	$(INTEG_AUDIOGEN)


api_docs:
	pdoc3 --html -o api_docs -f audiocraft

dist:
	python setup.py sdist

# Fix: declare ALL non-file targets phony. `default`, `install` and
# `tests_integ` were missing; in particular `tests` must stay phony because
# a `tests/` directory exists at the repo root.
.PHONY: default install linter tests tests_integ api_docs dist
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Musicgen
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 3.40.1
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: AudioCraft Plus v2.0.0a (MusicGen + AudioGen)
3
+ emoji: 🎶
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.39.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
8
+ # also released under the MIT license.
9
+
10
+ import argparse
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ import os
13
+ from pathlib import Path
14
+ import subprocess as sp
15
+ from tempfile import NamedTemporaryFile
16
+ import time
17
+ import typing as tp
18
+ import warnings
19
+
20
+ import torch
21
+ import gradio as gr
22
+
23
+ from audiocraft.data.audio_utils import convert_audio
24
+ from audiocraft.data.audio import audio_write
25
+ from audiocraft.models import MusicGen, MultiBandDiffusion
26
+
27
+
28
+ MODEL = None # Last used model
29
+ IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
30
+ print(IS_BATCHED)
31
+ MAX_BATCH_SIZE = 12
32
+ BATCHED_DURATION = 15
33
+ INTERRUPTING = False
34
+ MBD = None
35
+ # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
36
+ _old_call = sp.call
37
+
38
+
39
+ def _call_nostderr(*args, **kwargs):
40
+ # Avoid ffmpeg vomiting on the logs.
41
+ kwargs['stderr'] = sp.DEVNULL
42
+ kwargs['stdout'] = sp.DEVNULL
43
+ _old_call(*args, **kwargs)
44
+
45
+
46
# Monkey-patch subprocess.call so gr.make_waveform's ffmpeg invocations
# (which go through sp.call) run silenced by _call_nostderr.
sp.call = _call_nostderr
# Preallocating the pool of processes.
# The pool renders waveform videos in the background (see _do_predictions);
# it is entered once here and intentionally never exited (process lifetime).
pool = ProcessPoolExecutor(4)
pool.__enter__()
50
+
51
+
52
def interrupt():
    """Ask the running generation to abort at its next progress callback."""
    global INTERRUPTING
    INTERRUPTING = True
55
+
56
+
57
class FileCleaner:
    """Tracks temporary files and deletes them once they outlive a deadline.

    Entries are stored oldest-first; cleanup runs opportunistically on every
    `add` call, removing expired files from disk and from the registry.
    """

    def __init__(self, file_lifetime: float = 3600):
        self.file_lifetime = file_lifetime
        self.files = []

    def add(self, path: tp.Union[str, Path]):
        """Register *path* for deferred deletion, purging expired entries first."""
        self._cleanup()
        self.files.append((time.time(), Path(path)))

    def _cleanup(self):
        """Delete and forget every leading entry older than `file_lifetime`."""
        now = time.time()
        while self.files:
            added_at, path = self.files[0]
            if now - added_at <= self.file_lifetime:
                # List is chronological: first fresh entry ends the sweep.
                break
            if path.exists():
                path.unlink()
            self.files.pop(0)
75
+
76
+
77
# Module-wide registry of generated wav/mp4 temp files (1h default lifetime).
file_cleaner = FileCleaner()
78
+
79
+
80
def make_waveform(*args, **kwargs):
    """Render a waveform video via `gr.make_waveform`, muting its warnings.

    Returns the path produced by gradio; also logs the rendering duration.
    """
    started = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        video_path = gr.make_waveform(*args, **kwargs)
    print("Make a video took", time.time() - started)
    return video_path
88
+
89
+
90
def load_model(version='facebook/musicgen-melody'):
    """Load the requested MusicGen checkpoint into the global MODEL.

    Reuses the already-loaded model when its name matches *version*.
    """
    global MODEL
    print("Loading model", version)
    if MODEL is not None and MODEL.name == version:
        return
    MODEL = MusicGen.get_pretrained(version)
95
+
96
+
97
def load_diffusion():
    """Lazily instantiate the MultiBandDiffusion decoder into the MBD global."""
    global MBD
    if MBD is not None:
        return
    print("loading MBD")
    MBD = MultiBandDiffusion.get_mbd_musicgen()
102
+
103
+
104
def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
    """Generate audio for a batch of prompts and render waveform videos.

    Args:
        texts: list of text prompts.
        melodies: list parallel to `texts`; each item is None or a
            (sample_rate, numpy_array) pair as produced by gr.Audio.
        duration: target duration in seconds.
        progress: forwarded to the model's generation routines.
        **gen_kwargs: extra sampling parameters (top_k, top_p, ...).

    Returns:
        (out_videos, out_wavs): lists of file paths, one per generated clip
        (twice as many as prompts when the diffusion decoder is active).

    Relies on module globals MODEL, MBD and USE_DIFFUSION being set by
    load_model / load_diffusion / predict_full before the call.
    """
    MODEL.set_generation_params(duration=duration, **gen_kwargs)
    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
    be = time.time()
    processed_melodies = []
    target_sr = 32000
    target_ac = 1  # mono conditioning signal
    for melody in melodies:
        if melody is None:
            processed_melodies.append(None)
        else:
            # gr.Audio yields (sr, samples) with time as the first axis;
            # transpose to (channels, time) and move to the model device.
            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
            if melody.dim() == 1:
                melody = melody[None]
            # Trim the reference to the requested duration before resampling.
            melody = melody[..., :int(sr * duration)]
            melody = convert_audio(melody, sr, target_sr, target_ac)
            processed_melodies.append(melody)

    if any(m is not None for m in processed_melodies):
        outputs = MODEL.generate_with_chroma(
            descriptions=texts,
            melody_wavs=processed_melodies,
            melody_sample_rate=target_sr,
            progress=progress,
            return_tokens=USE_DIFFUSION
        )
    else:
        outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION)
    if USE_DIFFUSION:
        # outputs is (wavs, tokens) here; decode tokens with MBD and stack the
        # diffusion renders after the GAN renders along the batch dimension.
        outputs_diffusion = MBD.tokens_to_wav(outputs[1])
        outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
    outputs = outputs.detach().cpu().float()
    pending_videos = []
    out_wavs = []
    for output in outputs:
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
                file.name, output, MODEL.sample_rate, strategy="loudness",
                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
            # Video rendering is slow; farm it out to the process pool.
            pending_videos.append(pool.submit(make_waveform, file.name))
            out_wavs.append(file.name)
            file_cleaner.add(file.name)
    out_videos = [pending_video.result() for pending_video in pending_videos]
    for video in out_videos:
        file_cleaner.add(video)
    print("batch finished", len(texts), time.time() - be)
    print("Tempfiles currently stored: ", len(file_cleaner.files))
    return out_videos, out_wavs
152
+
153
+
154
def predict_batched(texts, melodies):
    """Batched-demo entry point: clamp prompt length and generate.

    Always uses the melody model at the fixed BATCHED_DURATION.
    """
    max_text_length = 512
    clamped = [prompt[:max_text_length] for prompt in texts]
    load_model('facebook/musicgen-melody')
    return _do_predictions(clamped, melodies, BATCHED_DURATION)
160
+
161
+
162
def predict_full(model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
    """Full-demo entry point: validate inputs, pick decoder, generate one clip.

    Returns (video, wav, diffusion_video, diffusion_wav); the last two are
    None unless the MultiBand_Diffusion decoder was selected.

    NOTE(review): `progress=gr.Progress()` is a mutable default evaluated at
    import time — this is the documented gradio idiom, but confirm it behaves
    with the gradio version pinned by the Space.
    """
    global INTERRUPTING
    global USE_DIFFUSION
    INTERRUPTING = False
    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    if topk < 0:
        raise gr.Error("Topk must be non-negative.")
    if topp < 0:
        raise gr.Error("Topp must be non-negative.")

    topk = int(topk)
    if decoder == "MultiBand_Diffusion":
        USE_DIFFUSION = True
        load_diffusion()
    else:
        USE_DIFFUSION = False
    load_model(model)

    def _progress(generated, to_generate):
        # Forward progress to the UI and honor the Interrupt button.
        progress((min(generated, to_generate), to_generate))
        if INTERRUPTING:
            raise gr.Error("Interrupted.")
    MODEL.set_custom_progress_callback(_progress)

    videos, wavs = _do_predictions(
        [text], [melody], duration, progress=True,
        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
    if USE_DIFFUSION:
        # _do_predictions appended the diffusion render after the GAN render.
        return videos[0], wavs[0], videos[1], wavs[1]
    return videos[0], wavs[0], None, None
193
+
194
+
195
def toggle_audio_src(choice):
    """Switch the melody widget between microphone capture and file upload."""
    if choice == "mic":
        return gr.update(source="microphone", value=None, label="Microphone")
    return gr.update(source="upload", value=None, label="File")
200
+
201
+
202
def toggle_diffusion(choice):
    """Show or hide the two MBD output widgets based on the chosen decoder."""
    visible = choice == "MultiBand_Diffusion"
    return [gr.update(visible=visible)] * 2
207
+
208
+
209
def ui_full(launch_kwargs):
    """Build and launch the full-featured (non-batched) Gradio demo.

    Args:
        launch_kwargs: keyword arguments forwarded to `Blocks.launch`
            (server_name, server_port, auth, share, ...). Blocks until the
            server stops.
    """
    with gr.Blocks() as interface:
        gr.Markdown(
            """
            # MusicGen
            This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
            a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
            """
        )
        with gr.Row():
            with gr.Column():
                # Input column: prompt, optional melody conditioning, controls.
                with gr.Row():
                    text = gr.Text(label="Input Text", interactive=True)
                    with gr.Column():
                        radio = gr.Radio(["file", "mic"], value="file",
                                         label="Condition on a melody (optional) File or Mic")
                        melody = gr.Audio(source="upload", type="numpy", label="File",
                                          interactive=True, elem_id="melody-input")
                with gr.Row():
                    submit = gr.Button("Submit")
                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                with gr.Row():
                    model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium", "facebook/musicgen-small",
                                      "facebook/musicgen-large"],
                                     label="Model", value="facebook/musicgen-melody", interactive=True)
                with gr.Row():
                    decoder = gr.Radio(["Default", "MultiBand_Diffusion"],
                                       label="Decoder", value="Default", interactive=True)
                with gr.Row():
                    duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
                with gr.Row():
                    # Sampling hyper-parameters forwarded to predict_full.
                    topk = gr.Number(label="Top-k", value=250, interactive=True)
                    topp = gr.Number(label="Top-p", value=0, interactive=True)
                    temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                    cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
            with gr.Column():
                # Output column: GAN decode always, diffusion decode optional.
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
                diffusion_output = gr.Video(label="MultiBand Diffusion Decoder")
                audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath')
        # First toggle diffusion widget visibility, then run generation.
        submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False,
                     show_progress=False).then(predict_full, inputs=[model, decoder, text, melody, duration, topk, topp,
                                                                     temperature, cfg_coef],
                                               outputs=[output, audio_output, diffusion_output, audio_diffusion])
        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)

        gr.Examples(
            fn=predict_full,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                    "facebook/musicgen-melody",
                    "Default"
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                    "facebook/musicgen-melody",
                    "Default"
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                    "facebook/musicgen-medium",
                    "Default"
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
                    "./assets/bach.mp3",
                    "facebook/musicgen-melody",
                    "Default"
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                    "facebook/musicgen-medium",
                    "Default"
                ],
                [
                    "Punk rock with loud drum and power guitar",
                    None,
                    "facebook/musicgen-medium",
                    "MultiBand_Diffusion"
                ],
            ],
            inputs=[text, melody, model, decoder],
            outputs=[output]
        )
        gr.Markdown(
            """
            ### More details

            The model will generate a short music extract based on the description you provided.
            The model can generate up to 30 seconds of audio in one pass. It is now possible
            to extend the generation by feeding back the end of the previous chunk of audio.
            This can take a long time, and the model might lose consistency. The model might also
            decide at arbitrary positions that the song ends.

            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
            An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
            are generated each time.

            We present 4 model variations:
            1. facebook/musicgen-melody -- a music generation model capable of generating music condition
                on text and melody inputs. **Note**, you can also use text only.
            2. facebook/musicgen-small -- a 300M transformer decoder conditioned on text only.
            3. facebook/musicgen-medium -- a 1.5B transformer decoder conditioned on text only.
            4. facebook/musicgen-large -- a 3.3B transformer decoder conditioned on text only.

            We also present two way of decoding the audio tokens
            1. Use the default GAN based compression model
            2. Use MultiBand Diffusion from (paper linknano )

            When using `facebook/musicgen-melody`, you can optionally provide a reference audio from
            which a broad melody will be extracted. The model will then try to follow both
            the description and melody provided.

            You can also use your own GPU or a Google Colab by following the instructions on our repo.
            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
            for more details.
            """
        )

        interface.queue().launch(**launch_kwargs)
336
+
337
+
338
def ui_batched(launch_kwargs):
    """Build and launch the batched public-Space demo (melody model only).

    Args:
        launch_kwargs: keyword arguments forwarded to `Blocks.launch`.
            Requests are batched up to MAX_BATCH_SIZE per call; blocks until
            the server stops.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # MusicGen

            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
            a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
            <br/>
            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
                style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
                src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
            for longer sequences, more control and no queue.</p>
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
                    with gr.Column():
                        radio = gr.Radio(["file", "mic"], value="file",
                                         label="Condition on a melody (optional) File or Mic")
                        melody = gr.Audio(source="upload", type="numpy", label="File",
                                          interactive=True, elem_id="melody-input")
                with gr.Row():
                    submit = gr.Button("Generate")
            with gr.Column():
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
        # batch=True lets gradio coalesce concurrent requests into one call.
        submit.click(predict_batched, inputs=[text, melody],
                     outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
        gr.Examples(
            fn=predict_batched,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
                    "./assets/bach.mp3",
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                ],
            ],
            inputs=[text, melody],
            outputs=[output]
        )
        gr.Markdown("""
        ### More details

        The model will generate 12 seconds of audio based on the description you provided.
        You can optionally provide a reference audio from which a broad melody will be extracted.
        The model will then try to follow both the description and melody provided.
        All samples are generated with the `melody` model.

        You can also use your own GPU or a Google Colab by following the instructions on our repo.

        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
        for more details.
        """)

        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
414
+
415
+
416
if __name__ == "__main__":
    # Command-line options controlling how the Gradio server is launched.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--listen',
        type=str,
        # Bind on all interfaces when running inside a HuggingFace Space.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument(
        '--username', type=str, default='', help='Username for authentication'
    )
    parser.add_argument(
        '--password', type=str, default='', help='Password for authentication'
    )
    parser.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument(
        '--inbrowser', action='store_true', help='Open in browser'
    )
    parser.add_argument(
        '--share', action='store_true', help='Share the gradio UI'
    )

    args = parser.parse_args()

    launch_kwargs = {}
    launch_kwargs['server_name'] = args.listen

    # Only forward options the user actually set; gradio uses its own
    # defaults for anything left out of launch_kwargs.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    if args.server_port:
        launch_kwargs['server_port'] = args.server_port
    if args.inbrowser:
        launch_kwargs['inbrowser'] = args.inbrowser
    if args.share:
        launch_kwargs['share'] = args.share

    # Show the interface
    if IS_BATCHED:
        # NOTE(review): `global` at module scope is a no-op; the assignment
        # below already targets the module global. Harmless but misleading.
        global USE_DIFFUSION
        USE_DIFFUSION = False
        ui_batched(launch_kwargs)
    else:
        ui_full(launch_kwargs)
app_v2.py ADDED
@@ -0,0 +1,1839 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
8
+ # also released under the MIT license.
9
+
10
+ import argparse
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ import os
13
+ from pathlib import Path
14
+ import subprocess as sp
15
+ from tempfile import NamedTemporaryFile
16
+ import time
17
+ import warnings
18
+ import glob
19
+ import re
20
+ from PIL import Image
21
+ from pydub import AudioSegment
22
+ from datetime import datetime
23
+
24
+ import json
25
+ import shutil
26
+ import taglib
27
+ import torch
28
+ import torchaudio
29
+ import gradio as gr
30
+ import numpy as np
31
+ import typing as tp
32
+
33
+ from audiocraft.data.audio_utils import convert_audio
34
+ from audiocraft.data.audio import audio_write
35
+ from audiocraft.models import AudioGen, MusicGen, MultiBandDiffusion
36
+ from audiocraft.utils import ui
37
+ import random, string
38
+
39
+ version = "2.0.0a"
40
+
41
+ theme = gr.themes.Base(
42
+ primary_hue="lime",
43
+ secondary_hue="lime",
44
+ neutral_hue="neutral",
45
+ ).set(
46
+ button_primary_background_fill_hover='*primary_500',
47
+ button_primary_background_fill_hover_dark='*primary_500',
48
+ button_secondary_background_fill_hover='*primary_500',
49
+ button_secondary_background_fill_hover_dark='*primary_500'
50
+ )
51
+
52
+ MODEL = None # Last used model
53
+ MODELS = None
54
+ UNLOAD_MODEL = False
55
+ MOVE_TO_CPU = False
56
+ IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
57
+ print(IS_BATCHED)
58
+ MAX_BATCH_SIZE = 12
59
+ BATCHED_DURATION = 15
60
+ INTERRUPTING = False
61
+ MBD = None
62
+ # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
63
+ _old_call = sp.call
64
+
65
+
66
def generate_random_string(length):
    """Return a random alphanumeric string of *length* characters."""
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))
69
+
70
+
71
def resize_video(input_path, output_path, target_width, target_height):
    """Rescale a video file to the requested resolution using ffmpeg.

    The audio stream is copied untouched; an existing output file is
    overwritten (-y).
    """
    command = [
        'ffmpeg',
        '-y',
        '-i', input_path,
        '-vf', f'scale={target_width}:{target_height}',
        '-c:a', 'copy',
        output_path,
    ]
    sp.run(command)
81
+
82
+
83
def _call_nostderr(*args, **kwargs):
    """Drop-in replacement for `subprocess.call` that silences child output.

    ffmpeg (invoked by `gr.make_waveform`) is very chatty, so both stdout and
    stderr go to DEVNULL.

    Fix: the previous version discarded `_old_call`'s result, so the patched
    `sp.call` always returned None and child failures were invisible. The
    exit code is now propagated.
    """
    kwargs['stderr'] = sp.DEVNULL
    kwargs['stdout'] = sp.DEVNULL
    return _old_call(*args, **kwargs)
88
+
89
+
90
# Monkey-patch subprocess.call so gr.make_waveform's ffmpeg invocations
# (which go through sp.call) run silenced by _call_nostderr.
sp.call = _call_nostderr
# Preallocating the pool of processes.
# Used to render waveform videos in the background; entered once and
# intentionally never exited (lives for the whole process).
pool = ProcessPoolExecutor(4)
pool.__enter__()
94
+
95
+
96
def interrupt():
    """Ask the running generation to abort at its next progress callback."""
    global INTERRUPTING
    INTERRUPTING = True
99
+
100
+
101
class FileCleaner:
    """Tracks temporary files and deletes them once they outlive a deadline.

    Entries are stored oldest-first; cleanup runs opportunistically on every
    `add` call, removing expired files from disk and from the registry.
    """

    def __init__(self, file_lifetime: float = 3600):
        self.file_lifetime = file_lifetime
        self.files = []

    def add(self, path: tp.Union[str, Path]):
        """Register *path* for deferred deletion, purging expired entries first."""
        self._cleanup()
        self.files.append((time.time(), Path(path)))

    def _cleanup(self):
        """Delete and forget every leading entry older than `file_lifetime`."""
        now = time.time()
        while self.files:
            added_at, path = self.files[0]
            if now - added_at <= self.file_lifetime:
                # List is chronological: first fresh entry ends the sweep.
                break
            if path.exists():
                path.unlink()
            self.files.pop(0)
119
+
120
+
121
# Module-wide registry of generated wav/mp4 temp files (1h default lifetime).
file_cleaner = FileCleaner()
122
+
123
+
124
def make_waveform(*args, **kwargs):
    """Render a waveform video via `gr.make_waveform`, then rescale it.

    `height`/`width` are consumed from kwargs before calling gradio (which
    does not accept them) and are clamped to a 256px minimum. Videos without
    a custom background image are normalized to 900x300.

    Fix: `height`/`width` now default to 256 instead of raising KeyError
    when the caller omits them (the clamp below already treats 256 as the
    floor, so the default matches existing behavior for small values).

    Returns the path of the rescaled mp4 (written to the working directory).
    """
    be = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        height = kwargs.pop('height', 256)
        width = kwargs.pop('width', 256)
        if height < 256:
            height = 256
        if width < 256:
            width = 256
        waveform_video = gr.make_waveform(*args, **kwargs)
        # NOTE(review): the output file is never registered with file_cleaner
        # here — confirm the caller adds it, otherwise these mp4s accumulate.
        out = f"{generate_random_string(12)}.mp4"
        image = kwargs.get('bg_image', None)
        if image is None:
            resize_video(waveform_video, out, 900, 300)
        else:
            resize_video(waveform_video, out, width, height)
        print("Make a video took", time.time() - be)
        return out
144
+
145
+
146
def load_model(version='GrandaddyShmax/musicgen-melody', custom_model=None, base_model='GrandaddyShmax/musicgen-medium', gen_type="music"):
    """Load a MusicGen/AudioGen checkpoint into the global MODEL.

    Args:
        version: checkpoint identifier; the sentinel
            'GrandaddyShmax/musicgen-custom' loads `base_model` and then
            overlays a local LM state dict from models/<custom_model>.pt.
        custom_model: file stem of the local .pt weights (custom path only).
        base_model: backbone used when loading a custom model.
        gen_type: "music" (MusicGen) or "audio" (AudioGen); any other value
            leaves MODEL unset in the fresh-load paths.

    When MODELS is None no caching is done; otherwise loaded models are kept
    in MODELS and moved between CPU (cache) and CUDA (active).
    """
    global MODEL, MODELS
    print("Loading model", version)
    if MODELS is None:
        # Cache disabled: always (re)load into MODEL directly.
        if version == 'GrandaddyShmax/musicgen-custom':
            MODEL = MusicGen.get_pretrained(base_model)
            file_path = os.path.abspath("models/" + str(custom_model) + ".pt")
            MODEL.lm.load_state_dict(torch.load(file_path))
        else:
            if gen_type == "music":
                MODEL = MusicGen.get_pretrained(version)
            elif gen_type == "audio":
                MODEL = AudioGen.get_pretrained(version)

        return

    else:
        t1 = time.monotonic()
        if MODEL is not None:
            MODEL.to('cpu')  # move to cache
            print("Previous model moved to CPU in %.2fs" % (time.monotonic() - t1))
            t1 = time.monotonic()
        if version != 'GrandaddyShmax/musicgen-custom' and MODELS.get(version) is None:
            print("Loading model %s from disk" % version)
            if gen_type == "music":
                result = MusicGen.get_pretrained(version)
            elif gen_type == "audio":
                result = AudioGen.get_pretrained(version)
            MODELS[version] = result
            print("Model loaded in %.2fs" % (time.monotonic() - t1))
            MODEL = result
            return
        # NOTE(review): when version == 'GrandaddyShmax/musicgen-custom' and
        # caching is on, this lookup runs without the state-dict overlay and
        # raises KeyError if the custom entry was never cached — confirm the
        # custom path is only ever used with MODELS is None.
        result = MODELS[version].to('cuda')
        print("Cached model loaded in %.2fs" % (time.monotonic() - t1))
        MODEL = result
181
+
182
+ def get_audio_info(audio_path):
183
+ if audio_path is not None:
184
+ if audio_path.name.endswith(".wav") or audio_path.name.endswith(".mp4") or audio_path.name.endswith(".json"):
185
+ if not audio_path.name.endswith(".json"):
186
+ with taglib.File(audio_path.name, save_on_exit=False) as song:
187
+ if 'COMMENT' not in song.tags:
188
+ return "No tags found. Either the file is not generated by MusicGen+ V1.2.7 and higher or the tags are corrupted. (Discord removes metadata from mp4 and wav files, so you can't use them)"
189
+ json_string = song.tags['COMMENT'][0]
190
+ data = json.loads(json_string)
191
+ global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
192
+ bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
193
+ key = str("\nKey: " + data['key']) if 'key' in data else ""
194
+ scale = str("\nScale: " + data['scale']) if 'scale' in data else ""
195
+ prompts = str("\nPrompts: " + (data['texts'] if data['texts'] != "['']" else "none")) if 'texts' in data else ""
196
+ duration = str("\nDuration: " + data['duration']) if 'duration' in data else ""
197
+ overlap = str("\nOverlap: " + data['overlap']) if 'overlap' in data else ""
198
+ seed = str("\nSeed: " + data['seed']) if 'seed' in data else ""
199
+ audio_mode = str("\nAudio Mode: " + data['audio_mode']) if 'audio_mode' in data else ""
200
+ input_length = str("\nInput Length: " + data['input_length']) if 'input_length' in data else ""
201
+ channel = str("\nChannel: " + data['channel']) if 'channel' in data else ""
202
+ sr_select = str("\nSample Rate: " + data['sr_select']) if 'sr_select' in data else ""
203
+ gen_type = str(data['generator'] + "gen-") if 'generator' in data else ""
204
+ model = str("\nModel: " + gen_type + data['model']) if 'model' in data else ""
205
+ custom_model = str("\nCustom Model: " + data['custom_model']) if 'custom_model' in data else ""
206
+ base_model = str("\nBase Model: " + data['base_model']) if 'base_model' in data else ""
207
+ decoder = str("\nDecoder: " + data['decoder']) if 'decoder' in data else ""
208
+ topk = str("\nTopk: " + data['topk']) if 'topk' in data else ""
209
+ topp = str("\nTopp: " + data['topp']) if 'topp' in data else ""
210
+ temperature = str("\nTemperature: " + data['temperature']) if 'temperature' in data else ""
211
+ cfg_coef = str("\nClassifier Free Guidance: " + data['cfg_coef']) if 'cfg_coef' in data else ""
212
+ version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
213
+ info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
214
+ if info == "":
215
+ return "No tags found. Either the file is not generated by MusicGen+ V1.2.7 and higher or the tags are corrupted. (Discord removes metadata from mp4 and wav files, so you can't use them)"
216
+ return info
217
+ else:
218
+ with open(audio_path.name) as json_file:
219
+ data = json.load(json_file)
220
+ #if 'global_prompt' not in data:
221
+ #return "No tags found. Either the file is not generated by MusicGen+ V1.2.8a and higher or the tags are corrupted."
222
+ global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
223
+ bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
224
+ key = str("\nKey: " + data['key']) if 'key' in data else ""
225
+ scale = str("\nScale: " + data['scale']) if 'scale' in data else ""
226
+ prompts = str("\nPrompts: " + (data['texts'] if data['texts'] != "['']" else "none")) if 'texts' in data else ""
227
+ duration = str("\nDuration: " + data['duration']) if 'duration' in data else ""
228
+ overlap = str("\nOverlap: " + data['overlap']) if 'overlap' in data else ""
229
+ seed = str("\nSeed: " + data['seed']) if 'seed' in data else ""
230
+ audio_mode = str("\nAudio Mode: " + data['audio_mode']) if 'audio_mode' in data else ""
231
+ input_length = str("\nInput Length: " + data['input_length']) if 'input_length' in data else ""
232
+ channel = str("\nChannel: " + data['channel']) if 'channel' in data else ""
233
+ sr_select = str("\nSample Rate: " + data['sr_select']) if 'sr_select' in data else ""
234
+ gen_type = str(data['generator'] + "gen-") if 'generator' in data else ""
235
+ model = str("\nModel: " + gen_type + data['model']) if 'model' in data else ""
236
+ custom_model = str("\nCustom Model: " + data['custom_model']) if 'custom_model' in data else ""
237
+ base_model = str("\nBase Model: " + data['base_model']) if 'base_model' in data else ""
238
+ decoder = str("\nDecoder: " + data['decoder']) if 'decoder' in data else ""
239
+ topk = str("\nTopk: " + data['topk']) if 'topk' in data else ""
240
+ topp = str("\nTopp: " + data['topp']) if 'topp' in data else ""
241
+ temperature = str("\nTemperature: " + data['temperature']) if 'temperature' in data else ""
242
+ cfg_coef = str("\nClassifier Free Guidance: " + data['cfg_coef']) if 'cfg_coef' in data else ""
243
+ version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
244
+ info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
245
+ if info == "":
246
+ return "No tags found. Either the file is not generated by MusicGen+ V1.2.7 and higher or the tags are corrupted."
247
+ return info
248
+ else:
249
+ return "Only .wav ,.mp4 and .json files are supported"
250
+ else:
251
+ return None
252
+
253
+
254
def info_to_params(audio_path):
    """Map metadata embedded in a generated file back onto the MusicGen UI controls.

    audio_path: an uploaded-file object (exposes a ``.name`` path) or None.
    Accepts .wav/.mp4 (JSON stored in the file's taglib COMMENT tag) or a raw
    .json sidecar file.  Returns a fixed 40-tuple:
    (decoder, struc_prompt, global_prompt, bpm, key, scale, model, custom_model,
     base_model, prompt_amount, text0..text9, repeat0..repeat9, audio_mode,
     duration, topk, topp, temperature, cfg_coef, seed, overlap, channel,
     sr_select).  Defaults fill any field missing from the tags.

    The original implementation duplicated the entire ~40-line parse block for
    the taglib and json branches; here the data dict is loaded first and parsed
    once.
    """
    defaults = ("Default", False, "", 120, "C", "Major", "large", None, "medium", 1,
                "", "", "", "", "", "", "", "", "", "",
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                "sample", 10, 250, 0, 1.0, 5.0, -1, 12, "stereo", "48000")
    if audio_path is None:
        return defaults
    if not audio_path.name.endswith((".wav", ".mp4", ".json")):
        return defaults

    # Load the metadata dict from whichever container we were given.
    if audio_path.name.endswith(".json"):
        with open(audio_path.name) as json_file:
            data = json.load(json_file)
    else:
        with taglib.File(audio_path.name, save_on_exit=False) as song:
            if 'COMMENT' not in song.tags:
                return defaults
            data = json.loads(song.tags['COMMENT'][0])

    # Every field is optional; "none" markers fall back to UI defaults.
    struc_prompt = (False if data['bpm'] == "none" else True) if 'bpm' in data else False
    global_prompt = data['global_prompt'] if 'global_prompt' in data else ""
    bpm = (120 if data['bpm'] == "none" else int(data['bpm'])) if 'bpm' in data else 120
    key = ("C" if data['key'] == "none" else data['key']) if 'key' in data else "C"
    scale = ("Major" if data['scale'] == "none" else data['scale']) if 'scale' in data else "Major"
    model = data['model'] if 'model' in data else "large"
    custom_model = (data['custom_model'] if data['custom_model'] in get_available_models() else None) if 'custom_model' in data else None
    base_model = data['base_model'] if 'base_model' in data else "medium"
    decoder = data['decoder'] if 'decoder' in data else "Default"

    # 'texts' was saved as str(list); recover the prompt strings and collapse
    # consecutive duplicates into a per-prompt repeat count.
    if 'texts' not in data:
        unique_prompts = 1
        text = ["", "", "", "", "", "", "", "", "", ""]
        repeat = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    else:
        s = re.findall(r"'(.*?)'", data['texts'])
        text = []
        repeat = []
        i = 0
        for elem in s:
            if elem.strip():
                # i tracks accepted elements; s[i-1] is the previous raw entry
                # (equivalent for the saved format, which has no empty gaps).
                if i == 0 or elem != s[i - 1]:
                    text.append(elem)
                    repeat.append(1)
                else:
                    repeat[-1] += 1
                i += 1
        # Pad both lists out to the 10 UI slots.
        text.extend([""] * (10 - len(text)))
        repeat.extend([1] * (10 - len(repeat)))
        unique_prompts = len([t for t in text if t])

    audio_mode = ("sample" if data['audio_mode'] == "none" else data['audio_mode']) if 'audio_mode' in data else "sample"
    duration = int(data['duration']) if 'duration' in data else 10
    topk = float(data['topk']) if 'topk' in data else 250
    topp = float(data['topp']) if 'topp' in data else 0
    temperature = float(data['temperature']) if 'temperature' in data else 1.0
    cfg_coef = float(data['cfg_coef']) if 'cfg_coef' in data else 5.0
    seed = int(data['seed']) if 'seed' in data else -1
    overlap = int(data['overlap']) if 'overlap' in data else 12
    channel = data['channel'] if 'channel' in data else "stereo"
    sr_select = data['sr_select'] if 'sr_select' in data else "48000"

    return (decoder, struc_prompt, global_prompt, bpm, key, scale, model, custom_model,
            base_model, unique_prompts, *text, *repeat, audio_mode, duration, topk, topp,
            temperature, cfg_coef, seed, overlap, channel, sr_select)
352
+
353
+
354
def info_to_params_a(audio_path):
    """AudioGen variant of info_to_params: restore UI control values from tags.

    audio_path: an uploaded-file object (exposes a ``.name`` path) or None.
    Accepts .wav/.mp4 (taglib COMMENT tag) or a .json sidecar.  Returns a fixed
    33-tuple: (decoder, struc_prompt, global_prompt, prompt_amount,
    text0..text9, repeat0..repeat9, duration, topk, topp, temperature,
    cfg_coef, seed, overlap, channel, sr_select).

    As with info_to_params, the previously duplicated taglib/json parse blocks
    are merged into one path.
    """
    defaults = ("Default", False, "", 1,
                "", "", "", "", "", "", "", "", "", "",
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                10, 250, 0, 1.0, 5.0, -1, 12, "stereo", "48000")
    if audio_path is None:
        return defaults
    if not audio_path.name.endswith((".wav", ".mp4", ".json")):
        return defaults

    # Load the metadata dict from whichever container we were given.
    if audio_path.name.endswith(".json"):
        with open(audio_path.name) as json_file:
            data = json.load(json_file)
    else:
        with taglib.File(audio_path.name, save_on_exit=False) as song:
            if 'COMMENT' not in song.tags:
                return defaults
            data = json.loads(song.tags['COMMENT'][0])

    # AudioGen has no bpm/key/scale; structure mode is implied by a non-empty
    # global prompt.
    struc_prompt = (False if data['global_prompt'] == "" else True) if 'global_prompt' in data else False
    global_prompt = data['global_prompt'] if 'global_prompt' in data else ""
    decoder = data['decoder'] if 'decoder' in data else "Default"

    # 'texts' was saved as str(list); recover prompts and collapse consecutive
    # duplicates into repeat counts (same scheme as info_to_params).
    if 'texts' not in data:
        unique_prompts = 1
        text = ["", "", "", "", "", "", "", "", "", ""]
        repeat = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    else:
        s = re.findall(r"'(.*?)'", data['texts'])
        text = []
        repeat = []
        i = 0
        for elem in s:
            if elem.strip():
                if i == 0 or elem != s[i - 1]:
                    text.append(elem)
                    repeat.append(1)
                else:
                    repeat[-1] += 1
                i += 1
        text.extend([""] * (10 - len(text)))
        repeat.extend([1] * (10 - len(repeat)))
        unique_prompts = len([t for t in text if t])

    duration = int(data['duration']) if 'duration' in data else 10
    topk = float(data['topk']) if 'topk' in data else 250
    topp = float(data['topp']) if 'topp' in data else 0
    temperature = float(data['temperature']) if 'temperature' in data else 1.0
    cfg_coef = float(data['cfg_coef']) if 'cfg_coef' in data else 5.0
    seed = int(data['seed']) if 'seed' in data else -1
    overlap = int(data['overlap']) if 'overlap' in data else 12
    channel = data['channel'] if 'channel' in data else "stereo"
    sr_select = data['sr_select'] if 'sr_select' in data else "48000"

    return (decoder, struc_prompt, global_prompt, unique_prompts, *text, *repeat,
            duration, topk, topp, temperature, cfg_coef, seed, overlap, channel, sr_select)
439
+
440
+
441
def make_pseudo_stereo (filename, sr_select, pan, delay):
    """Overwrite a mono wav file in place with a pseudo-stereo version.

    filename: path of the wav file to widen (rewritten in place).
    sr_select: target sample rate as a string; "32000" means keep the native rate.
    pan: widen via two opposite-panned, attenuated copies (pydub).
    delay: widen by pairing the dry signal with a 10 ms delayed copy (torchaudio).
    Both effects may be applied; the delay step reloads whatever pan wrote.
    """
    if pan:
        temp = AudioSegment.from_wav(filename)
        if sr_select != "32000":
            temp = temp.set_frame_rate(int(sr_select))
        # pydub's "- 5" lowers gain by 5 dB so the overlay does not clip;
        # pans are intentionally asymmetric (-0.5 vs 0.6).
        left = temp.pan(-0.5) - 5
        right = temp.pan(0.6) - 5
        # position=5 shifts the right copy by 5 ms for extra width.
        temp = left.overlay(right, position=5)
        temp.export(filename, format="wav")
    if delay:
        waveform, sample_rate = torchaudio.load(filename)  # load mono WAV file
        delay_seconds = 0.01  # set delay 10ms
        delay_samples = int(delay_seconds * sample_rate)  # Calculating delay value in number of samples
        # Channel 0 stays dry; channel 1 is zero-padded at the front and
        # truncated at the end, i.e. the same audio shifted 10 ms later.
        stereo_waveform = torch.stack([waveform[0], torch.cat((torch.zeros(delay_samples), waveform[0][:-delay_samples]))])  # Generate a stereo file with original mono audio and delayed version
        torchaudio.save(filename, stereo_waveform, sample_rate)
    return
457
+
458
+
459
def normalize_audio(audio_data):
    """Peak-normalize an audio buffer to the range [-1.0, 1.0].

    audio_data: numpy array of samples (any numeric dtype, any shape).
    Returns a float32 array scaled so the absolute peak is 1.0.  An all-zero
    (silent) buffer is returned unchanged — the original divided by the zero
    peak and produced NaN/Inf samples.
    """
    audio_data = audio_data.astype(np.float32)
    max_value = np.max(np.abs(audio_data))
    if max_value == 0:
        # Silent input: nothing to scale, and dividing would yield NaNs.
        return audio_data
    audio_data /= max_value
    return audio_data
464
+
465
+
466
def load_diffusion():
    """Lazily instantiate the global MultiBand Diffusion decoder (no-op if loaded)."""
    global MBD
    if MBD is not None:
        return
    print("loading MBD")
    MBD = MultiBandDiffusion.get_mbd_musicgen()
471
+
472
+
473
def unload_diffusion():
    """Drop the global MultiBand Diffusion decoder so it can be garbage-collected."""
    global MBD
    if MBD is None:
        return
    print("unloading MBD")
    MBD = None
478
+
479
+
480
def _do_predictions(gen_type, texts, melodies, sample, trim_start, trim_end, duration, image, height, width, background, bar1, bar2, channel, sr_select, progress=False, **gen_kwargs):
    """Core generation path shared by the music and audio tabs.

    Generates with the global MODEL — continuation when `sample` is given,
    chroma conditioning when any melody is given, plain text-to-audio
    otherwise — then renders every output to a wav file plus a waveform video.

    gen_type: "music" or "audio" (any other value would leave maximum_size /
        target_sr unbound below; callers only pass these two).
    texts: list of prompt-lists, one per batch entry.
    melodies: per-entry (sample_rate, np.ndarray) tuple or None.
    sample: optional (sample_rate, np.ndarray) continuation prompt.
    Returns (video_paths, wav_paths, backup_wav_paths, input_length_seconds).
    Relies on module globals: MODEL, MBD, USE_DIFFUSION, pool, file_cleaner,
    MOVE_TO_CPU, UNLOAD_MODEL.
    """
    # Longest continuation prompt (seconds) the model accepts; any excess head
    # is cut off into sampleP and re-attached after generation.
    if gen_type == "music":
        maximum_size = 29.5
    elif gen_type == "audio":
        maximum_size = 9.5
    cut_size = 0
    input_length = 0
    sampleP = None
    if sample is not None:
        globalSR, sampleM = sample[0], sample[1]
        sampleM = normalize_audio(sampleM)
        sampleM = torch.from_numpy(sampleM).t()
        if sampleM.dim() == 1:
            sampleM = sampleM.unsqueeze(0)  # ensure a leading channel dim
        sample_length = sampleM.shape[sampleM.dim() - 1] / globalSR  # seconds
        # Clamp the trims so at least 0.5 s of the sample survives.
        if trim_start >= sample_length:
            trim_start = sample_length - 0.5
        if trim_end >= sample_length:
            trim_end = sample_length - 0.5
        if trim_start + trim_end >= sample_length:
            tmp = sample_length - 0.5
            trim_start = tmp / 2
            trim_end = tmp / 2
        sampleM = sampleM[..., int(globalSR * trim_start):int(globalSR * (sample_length - trim_end))]
        sample_length = sample_length - (trim_start + trim_end)
        if sample_length > maximum_size:
            # Keep only the tail within the model limit; stash the head.
            cut_size = sample_length - maximum_size
            sampleP = sampleM[..., :int(globalSR * cut_size)]
            sampleM = sampleM[..., int(globalSR * cut_size):]
        if sample_length >= duration:
            duration = sample_length + 0.5
        input_length = sample_length
    global MODEL
    MODEL.set_generation_params(duration=(duration - cut_size), **gen_kwargs)
    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies], [None if sample is None else (sample[0], sample[1].shape)])
    be = time.time()
    processed_melodies = []
    # Native model rates: MusicGen 32 kHz, AudioGen 16 kHz; mono conditioning.
    if gen_type == "music":
        target_sr = 32000
    elif gen_type == "audio":
        target_sr = 16000
    target_ac = 1

    for melody in melodies:
        if melody is None:
            processed_melodies.append(None)
        else:
            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
            if melody.dim() == 1:
                melody = melody[None]
            melody = melody[..., :int(sr * duration)]
            melody = convert_audio(melody, sr, target_sr, target_ac)
            processed_melodies.append(melody)

    if sample is not None:
        if sampleP is None:
            # NOTE(review): the "audio" calls omit return_tokens, so outputs is
            # a plain tensor there while the USE_DIFFUSION branch below indexes
            # a (wav, tokens) tuple — diffusion appears music-only; confirm.
            if gen_type == "music":
                outputs = MODEL.generate_continuation(
                    prompt=sampleM,
                    prompt_sample_rate=globalSR,
                    descriptions=texts,
                    progress=progress,
                    return_tokens=USE_DIFFUSION
                )
            elif gen_type == "audio":
                outputs = MODEL.generate_continuation(
                    prompt=sampleM,
                    prompt_sample_rate=globalSR,
                    descriptions=texts,
                    progress=progress
                )
        else:
            # Oversized prompt: generate from the trimmed tail, then glue the
            # cut-off head (sampleP) back in front of the generated audio.
            if sampleP.dim() > 1:
                sampleP = convert_audio(sampleP, globalSR, target_sr, target_ac)
            sampleP = sampleP.to(MODEL.device).float().unsqueeze(0)
            if gen_type == "music":
                outputs = MODEL.generate_continuation(
                    prompt=sampleM,
                    prompt_sample_rate=globalSR,
                    descriptions=texts,
                    progress=progress,
                    return_tokens=USE_DIFFUSION
                )
            elif gen_type == "audio":
                outputs = MODEL.generate_continuation(
                    prompt=sampleM,
                    prompt_sample_rate=globalSR,
                    descriptions=texts,
                    progress=progress
                )
            outputs = torch.cat([sampleP, outputs], 2)

    elif any(m is not None for m in processed_melodies):
        if gen_type == "music":
            outputs = MODEL.generate_with_chroma(
                descriptions=texts,
                melody_wavs=processed_melodies,
                melody_sample_rate=target_sr,
                progress=progress,
                return_tokens=USE_DIFFUSION
            )
        elif gen_type == "audio":
            outputs = MODEL.generate_with_chroma(
                descriptions=texts,
                melody_wavs=processed_melodies,
                melody_sample_rate=target_sr,
                progress=progress
            )
    else:
        if gen_type == "music":
            outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION)
        elif gen_type == "audio":
            outputs = MODEL.generate(texts, progress=progress)

    if USE_DIFFUSION:
        # Decode the token stream with MultiBand Diffusion as well, and stack
        # both renderings so each gets written out below.
        print("outputs: " + str(outputs))
        outputs_diffusion = MBD.tokens_to_wav(outputs[1])
        outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
    outputs = outputs.detach().cpu().float()
    backups = outputs  # kept at the native model rate for the backup wavs
    if channel == "stereo":
        outputs = convert_audio(outputs, target_sr, int(sr_select), 2)
    elif channel == "mono" and sr_select != "32000":
        outputs = convert_audio(outputs, target_sr, int(sr_select), 1)
    out_files = []
    out_audios = []
    out_backup = []
    for output in outputs:
        # delete=False: the path must outlive the handle; file_cleaner reaps it.
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
                file.name, output, (MODEL.sample_rate if channel == "stereo effect" else int(sr_select)), strategy="loudness",
                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)

            if channel == "stereo effect":
                make_pseudo_stereo(file.name, sr_select, pan=True, delay=True);

            # Waveform-video rendering is slow, so it is farmed out to the pool.
            out_files.append(pool.submit(make_waveform, file.name, bg_image=image, bg_color=background, bars_color=(bar1, bar2), fg_alpha=1.0, bar_count=75, height=height, width=width))
            out_audios.append(file.name)
            file_cleaner.add(file.name)
            print(f'wav: {file.name}')
    for backup in backups:
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
                file.name, backup, MODEL.sample_rate, strategy="loudness",
                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
            out_backup.append(file.name)
            file_cleaner.add(file.name)
    res = [out_file.result() for out_file in out_files]
    res_audio = out_audios
    res_backup = out_backup
    for file in res:
        file_cleaner.add(file)
        print(f'video: {file}')
    print("batch finished", len(texts), time.time() - be)
    print("Tempfiles currently stored: ", len(file_cleaner.files))
    if MOVE_TO_CPU:
        MODEL.to('cpu')
    if UNLOAD_MODEL:
        MODEL = None
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    return res, res_audio, res_backup, input_length
642
+
643
+
644
def predict_batched(texts, melodies):
    """Batched demo entry point: truncate prompts and generate with the melody model.

    NOTE(review): `_do_predictions` now requires (gen_type, texts, melodies,
    sample, trim_start, trim_end, duration, ...) — this call passes only
    (texts, melodies, BATCHED_DURATION) and would raise TypeError if invoked.
    Looks like a stale leftover from the upstream batched demo; confirm before
    wiring it to any UI path.
    """
    max_text_length = 512
    texts = [text[:max_text_length] for text in texts]
    load_model('melody')
    res = _do_predictions(texts, melodies, BATCHED_DURATION)
    return res
650
+
651
+
652
def add_tags(filename, tags):
    """Attach generation metadata to an output file and write a JSON sidecar.

    filename: media file (wav/mp4) to tag; tagging is skipped if it does not
        exist on disk.
    tags: positional metadata list as assembled by predict_full (tags[7] is the
        seed string, which also names the sidecar file).
    Returns the path of the sidecar json file ("<seed>.json").
    Relies on module globals `taglib` and `version`.
    """
    data = {
        "global_prompt": tags[0],
        "bpm": tags[1],
        "key": tags[2],
        "scale": tags[3],
        "texts": tags[4],
        "duration": tags[5],
        "overlap": tags[6],
        "seed": tags[7],
        "audio_mode": tags[8],
        "input_length": tags[9],
        "channel": tags[10],
        "sr_select": tags[11],
        "model": tags[12],
        "custom_model": tags[13],
        "base_model": tags[14],
        "decoder": tags[15],
        "topk": tags[16],
        "topp": tags[17],
        "temperature": tags[18],
        "cfg_coef": tags[19],
        "generator": tags[20],
        "version": version
    }
    json_string = json.dumps(data)

    # Store the whole JSON blob in the COMMENT tag so info_to_params can
    # recover it later.
    if os.path.exists(filename):
        with taglib.File(filename, save_on_exit=True) as song:
            song.tags = {'COMMENT': json_string}

    # Sidecar json named after the seed; the context manager guarantees the
    # handle is closed (the original left the file object to the GC).
    json_path = tags[7] + '.json'
    with open(json_path, 'w') as json_file:
        json_file.write(json_string)

    return json_path
691
+
692
+
693
def save_outputs(mp4, wav_tmp, tags, gen_type):
    """Archive one generation under ./output/<date>/<gen_type>/{wav,mp4,json}.

    mp4: generated .mp4 (path relative to the app's working directory).
    wav_tmp: temporary wav file (e.g. in %TEMP%).
    tags: metadata list; tags[7] (the seed) names the files.  If a file with
        the same seed already exists, a "(n)" postfix is appended.
    gen_type: "music" or "audio" — selects the output subtree.
    Returns (wav_target, mp4_target, json_target).

    Fixes vs. the original: targets are built from the known directories
    instead of `str.replace('wav', 'mp4')` on the whole path (which corrupted
    any path containing "wav" elsewhere, e.g. a user directory), and the
    pointless `open(target, 'w')` wrapper around copyfile is gone.
    """
    current_date = datetime.now().strftime("%Y%m%d")
    out_root = os.path.join(os.getcwd(), 'output', current_date, gen_type)
    wav_directory = os.path.join(out_root, 'wav')
    mp4_directory = os.path.join(out_root, 'mp4')
    json_directory = os.path.join(out_root, 'json')
    os.makedirs(wav_directory, exist_ok=True)
    os.makedirs(mp4_directory, exist_ok=True)
    os.makedirs(json_directory, exist_ok=True)

    # Find a free base name: seed, then seed(1), seed(2), ... (checked against
    # the wav folder; mp4/json share the same base).
    base = str(tags[7])
    candidate = base
    counter = 1
    while os.path.exists(os.path.join(wav_directory, candidate + '.wav')):
        candidate = f"{base}({counter})"
        counter += 1

    wav_target = os.path.join(wav_directory, candidate + '.wav')
    mp4_target = os.path.join(mp4_directory, candidate + '.mp4')
    json_target = os.path.join(json_directory, candidate + '.json')

    shutil.copyfile(wav_tmp, wav_target)  # copy, leave the temp file in place
    json_file = add_tags(wav_target, tags)

    shutil.copyfile(os.path.join('.', mp4), mp4_target)
    _ = add_tags(mp4_target, tags)

    # The sidecar json produced by add_tags becomes the archived json.
    shutil.copyfile(json_file, json_target)
    os.remove(json_file)

    return wav_target, mp4_target, json_target
740
+
741
+
742
def clear_cash():
    """Delete today's temporary generation artifacts.

    Removes .mp4 files created today in the working directory, and tmp*.mp4 /
    tmp*.wav / tmp*.png files created today in the system temp directory.
    Fix vs. the original: `os.environ['TEMP']` only exists on Windows — on
    other platforms os.path.join(None, ...) raised TypeError; fall back to
    tempfile.gettempdir().  The four copy-pasted purge loops are also merged.
    """
    import tempfile

    today = datetime.now().date()

    def _purge(directory, pattern):
        # Remove files matching `pattern` whose ctime falls on today's date.
        for path in glob.glob(os.path.join(directory, pattern)):
            if datetime.fromtimestamp(os.path.getctime(path)).date() == today:
                os.remove(path)

    _purge(os.getcwd(), '*.mp4')

    temp_directory = os.environ.get('TEMP') or tempfile.gettempdir()
    for pattern in ('tmp*.mp4', 'tmp*.wav', 'tmp*.png'):
        _purge(temp_directory, pattern)
    return
771
+
772
+
773
def s2t(seconds, seconds2):
    """Format a start/end pair of second counts as "MM:SS - MM:SS".

    When the range is non-empty and does not start at zero, the start's
    seconds remainder is bumped by one so consecutive windows don't overlap.
    Note the bump happens after divmod, so e.g. a start of 59 renders as
    "00:60" — kept as-is for parity with existing output.
    """
    start_m, start_s = divmod(seconds, 60)
    end_m, end_s = divmod(seconds2, 60)
    if seconds != 0 and seconds < seconds2:
        start_s += 1
    return "%02d:%02d - %02d:%02d" % (start_m, start_s, end_m, end_s)
782
+
783
+
784
def calc_time(gen_type, s, duration, overlap, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9):
    """Compute the "MM:SS - MM:SS" window each of the ten prompt slots covers.

    gen_type: "music" (30 s segments) or "audio" (10 s segments).
    s: number of active prompt slots (1-based).
    duration: total requested length in seconds.
    overlap: seconds shared between consecutive segments.
    d0..d9: per-slot drag/repeat counts.
    Returns a 10-tuple of window strings; slots at/after the end of the piece
    (or after the last active slot) are clamped to the total duration.
    """
    drags = [int(d) for d in (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9)]
    last_slot = s - 1  # index of the final active prompt
    total = duration
    if gen_type == "music":
        segment = 30
    elif gen_type == "audio":
        segment = 10
    else:
        segment = 0
    step = segment - overlap  # seconds each additional repeat adds

    # Span of each slot: the first includes a full segment, the rest only
    # contribute their repeats' worth of non-overlapping time.
    spans = [segment + (drags[0] - 1) * step]
    spans.extend(drags[idx] * step for idx in range(1, 10))

    windows = []
    elapsed = 0
    for idx in range(10):
        upcoming = spans[0] if idx == 0 else elapsed + spans[idx]
        if upcoming >= total or idx == last_slot:
            windows.append(s2t(elapsed, total))
            elapsed = total
        else:
            windows.append(s2t(elapsed, upcoming))
            elapsed = upcoming
    return tuple(windows)
821
+
822
+
823
def predict_full(gen_type, model, decoder, custom_model, base_model, prompt_amount, struc_prompt, bpm, key, scale, global_prompt, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, audio, mode, trim_start, trim_end, duration, topk, topp, temperature, cfg_coef, seed, overlap, image, height, width, background, bar1, bar2, channel, sr_select, progress=gr.Progress()):
    """Top-level handler for the Generate button: validate, build prompts, run.

    Validates sampling parameters, resolves/loads the requested model, expands
    the 10 prompt slots (p0..p9 with drag counts d0..d9) into the full prompt
    list, generates via _do_predictions, and archives the results.
    Returns (mp4_path, wav_path, backup_wav_path, [mp4, wav, json], seed).
    The `progress=gr.Progress()` mutable default is gradio's documented idiom
    for progress tracking, not a bug.
    Relies on module globals: MODEL, MOVE_TO_CPU, INTERRUPTING, USE_DIFFUSION.
    """
    global INTERRUPTING
    global USE_DIFFUSION
    INTERRUPTING = False

    # AudioGen has no custom/base model support.
    if gen_type == "audio":
        custom_model = None
        base_model = "medium"

    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    if topk < 0:
        raise gr.Error("Topk must be non-negative.")
    if topp < 0:
        raise gr.Error("Topp must be non-negative.")

    if trim_start < 0:
        trim_start = 0
    if trim_end < 0:
        trim_end = 0

    topk = int(topk)

    if decoder == "MultiBand_Diffusion":
        USE_DIFFUSION = True
        load_diffusion()
    else:
        USE_DIFFUSION = False
        unload_diffusion()

    # Keep the short UI name for the tags; expand to the HF repo id for loading.
    if gen_type == "music":
        model_shrt = model
        model = "GrandaddyShmax/musicgen-" + model
    elif gen_type == "audio":
        model_shrt = model
        model = "GrandaddyShmax/audiogen-" + model
    base_model_shrt = base_model
    base_model = "GrandaddyShmax/musicgen-" + base_model

    if MODEL is None or MODEL.name != (model):
        load_model(model, custom_model, base_model, gen_type)
    else:
        if MOVE_TO_CPU:
            MODEL.to('cuda')

    # -1 means "random seed"; record whichever seed is actually used.
    if seed < 0:
        seed = random.randint(0, 0xffff_ffff_ffff)
    torch.manual_seed(seed)

    def _progress(generated, to_generate):
        # Forward model progress to gradio; also the interruption check point.
        progress((min(generated, to_generate), to_generate))
        if INTERRUPTING:
            raise gr.Error("Interrupted.")
    MODEL.set_custom_progress_callback(_progress)

    audio_mode = "none"
    melody = None
    sample = None
    if audio:
        audio_mode = mode
        if mode == "sample":
            sample = audio
        elif mode == "melody":
            melody = audio

    base_model = "none" if model != "custom" else base_model
    custom_model = "none" if model != "custom" else custom_model

    # Expand the slot prompts: each slot is repeated drag_cat[ind] times; with
    # structure prompts enabled, bpm/key/global prefixes are baked in.
    text_cat = [p0, p1, p2, p3, p4, p5, p6, p7, p8, p9]
    drag_cat = [d0, d1, d2, d3, d4, d5, d6, d7, d8, d9]
    texts = []
    raw_texts = []
    ind = 0
    ind2 = 0
    while ind < prompt_amount:
        for ind2 in range(int(drag_cat[ind])):
            if not struc_prompt:
                texts.append(text_cat[ind])
                global_prompt = "none"
                bpm = "none"
                key = "none"
                scale = "none"
                raw_texts.append(text_cat[ind])
            else:
                if gen_type == "music":
                    bpm_str = str(bpm) + " bpm"
                    key_str = ", " + str(key) + " " + str(scale)
                    global_str = (", " + str(global_prompt)) if str(global_prompt) != "" else ""
                elif gen_type == "audio":
                    bpm_str = ""
                    key_str = ""
                    global_str = (str(global_prompt)) if str(global_prompt) != "" else ""
                texts_str = (", " + str(text_cat[ind])) if str(text_cat[ind]) != "" else ""
                texts.append(bpm_str + key_str + global_str + texts_str)
                raw_texts.append(text_cat[ind])
        ind2 = 0
        ind = ind + 1

    outs, outs_audio, outs_backup, input_length = _do_predictions(
        gen_type, [texts], [melody], sample, trim_start, trim_end, duration, image, height, width, background, bar1, bar2, channel, sr_select, progress=True,
        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef, extend_stride=MODEL.max_duration-overlap)
    # Positional metadata list consumed by add_tags / save_outputs (index 7 = seed).
    tags = [str(global_prompt), str(bpm), str(key), str(scale), str(raw_texts), str(duration), str(overlap), str(seed), str(audio_mode), str(input_length), str(channel), str(sr_select), str(model_shrt), str(custom_model), str(base_model_shrt), str(decoder), str(topk), str(topp), str(temperature), str(cfg_coef), str(gen_type)]
    wav_target, mp4_target, json_target = save_outputs(outs[0], outs_audio[0], tags, gen_type);
    # Removes the temporary files.
    for out in outs:
        os.remove(out)
    for out in outs_audio:
        os.remove(out)

    return mp4_target, wav_target, outs_backup[0], [mp4_target, wav_target, json_target], seed
933
+
934
+
935
max_textboxes = 10  # upper bound on prompt-segment rows shown in the UI (one slider per tab)
936
+
937
+
938
def get_available_models():
    """Return the sorted names of custom model checkpoints in ``models/``.

    Scans the ``models/`` directory for files ending in ``.pt`` and strips
    that extension, producing the choices for the "Custom Model" dropdown.

    Returns:
        list[str]: sorted model names (without the ``.pt`` suffix).
    """
    # NOTE: the dot must be escaped — the original pattern '.pt$' matched
    # any character before "pt". Harmless here only because of the
    # endswith('.pt') filter, but escape it so the intent is explicit.
    return sorted(re.sub(r'\.pt$', '', item.name)
                  for item in Path('models/').glob('*')
                  if item.name.endswith('.pt'))
940
+
941
+
942
def toggle_audio_src(choice):
    """Swap the input-audio widget between microphone capture and file upload.

    Args:
        choice: either ``"mic"`` or any other value (treated as file mode).

    Returns:
        A ``gr.update`` that retargets the audio component's source, clears
        its current value, and relabels it accordingly.
    """
    use_mic = choice == "mic"
    source = "microphone" if use_mic else "upload"
    label = "Microphone" if use_mic else "File"
    return gr.update(source=source, value=None, label=label)
947
+
948
+
949
+ def ui_full(launch_kwargs):
950
+ with gr.Blocks(title='AudioCraft Plus', theme=theme) as interface:
951
+ gr.Markdown(
952
+ """
953
+ # AudioCraft Plus - v2.0.0a
954
+
955
+ ### An All-in-One AudioCraft WebUI
956
+
957
+ #### **Disclaimer:** This will not run on CPU only. It's best to clone this App and run on GPU instance!
958
+ **Alternatively**, you can run this for free on a google colab:
959
+ https://colab.research.google.com/github/camenduru/MusicGen-colab/blob/main/MusicGen_ClownOfMadness_plus_colab.ipynb
960
+
961
+ **Or**, run this locally on your PC:
962
+ https://github.com/GrandaddyShmax/audiocraft_plus/tree/main
963
+
964
+ Thanks to: facebookresearch, Camenduru, rkfg, oobabooga, AlexHK and GrandaddyShmax
965
+ """
966
+ )
967
+ with gr.Tab("MusicGen"):
968
+ gr.Markdown(
969
+ """
970
+ ### MusicGen
971
+ """
972
+ )
973
+ with gr.Row():
974
+ with gr.Column():
975
+ with gr.Tab("Generation"):
976
+ with gr.Accordion("Structure Prompts", open=False):
977
+ with gr.Column():
978
+ with gr.Row():
979
+ struc_prompts = gr.Checkbox(label="Enable", value=False, interactive=True, container=False)
980
+ bpm = gr.Number(label="BPM", value=120, interactive=True, scale=1, precision=0)
981
+ key = gr.Dropdown(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "Bb", "B"], label="Key", value="C", interactive=True)
982
+ scale = gr.Dropdown(["Major", "Minor"], label="Scale", value="Major", interactive=True)
983
+ with gr.Row():
984
+ global_prompt = gr.Text(label="Global Prompt", interactive=True, scale=3)
985
+ with gr.Row():
986
+ s = gr.Slider(1, max_textboxes, value=1, step=1, label="Prompts:", interactive=True, scale=2)
987
+ #s_mode = gr.Radio(["segmentation", "batch"], value="segmentation", interactive=True, scale=1, label="Generation Mode")
988
+ with gr.Column():
989
+ textboxes = []
990
+ prompts = []
991
+ repeats = []
992
+ calcs = []
993
+ with gr.Row():
994
+ text0 = gr.Text(label="Input Text", interactive=True, scale=4)
995
+ prompts.append(text0)
996
+ drag0 = gr.Number(label="Repeat", value=1, interactive=True, scale=1)
997
+ repeats.append(drag0)
998
+ calc0 = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
999
+ calcs.append(calc0)
1000
+ for i in range(max_textboxes):
1001
+ with gr.Row(visible=False) as t:
1002
+ text = gr.Text(label="Input Text", interactive=True, scale=3)
1003
+ repeat = gr.Number(label="Repeat", minimum=1, value=1, interactive=True, scale=1)
1004
+ calc = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
1005
+ textboxes.append(t)
1006
+ prompts.append(text)
1007
+ repeats.append(repeat)
1008
+ calcs.append(calc)
1009
+ to_calc = gr.Button("Calculate Timings", variant="secondary")
1010
+ with gr.Row():
1011
+ duration = gr.Slider(minimum=1, maximum=300, value=10, step=1, label="Duration", interactive=True)
1012
+ with gr.Row():
1013
+ overlap = gr.Slider(minimum=1, maximum=29, value=12, step=1, label="Overlap", interactive=True)
1014
+ with gr.Row():
1015
+ seed = gr.Number(label="Seed", value=-1, scale=4, precision=0, interactive=True)
1016
+ gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[seed], queue=False)
1017
+ reuse_seed = gr.Button('\u267b\ufe0f', scale=1)
1018
+
1019
+ with gr.Tab("Audio"):
1020
+ with gr.Row():
1021
+ with gr.Column():
1022
+ input_type = gr.Radio(["file", "mic"], value="file", label="Input Type (optional)", interactive=True)
1023
+ mode = gr.Radio(["melody", "sample"], label="Input Audio Mode (optional)", value="sample", interactive=True)
1024
+ with gr.Row():
1025
+ trim_start = gr.Number(label="Trim Start", value=0, interactive=True)
1026
+ trim_end = gr.Number(label="Trim End", value=0, interactive=True)
1027
+ audio = gr.Audio(source="upload", type="numpy", label="Input Audio (optional)", interactive=True)
1028
+
1029
+ with gr.Tab("Customization"):
1030
+ with gr.Row():
1031
+ with gr.Column():
1032
+ background = gr.ColorPicker(value="#0f0f0f", label="background color", interactive=True, scale=0)
1033
+ bar1 = gr.ColorPicker(value="#84cc16", label="bar color start", interactive=True, scale=0)
1034
+ bar2 = gr.ColorPicker(value="#10b981", label="bar color end", interactive=True, scale=0)
1035
+ with gr.Column():
1036
+ image = gr.Image(label="Background Image", type="filepath", interactive=True, scale=4)
1037
+ with gr.Row():
1038
+ height = gr.Number(label="Height", value=512, interactive=True)
1039
+ width = gr.Number(label="Width", value=768, interactive=True)
1040
+
1041
+ with gr.Tab("Settings"):
1042
+ with gr.Row():
1043
+ channel = gr.Radio(["mono", "stereo", "stereo effect"], label="Output Audio Channels", value="stereo", interactive=True, scale=1)
1044
+ sr_select = gr.Dropdown(["11025", "16000", "22050", "24000", "32000", "44100", "48000"], label="Output Audio Sample Rate", value="48000", interactive=True)
1045
+ with gr.Row():
1046
+ model = gr.Radio(["melody", "small", "medium", "large", "custom"], label="Model", value="large", interactive=True, scale=1)
1047
+ with gr.Column():
1048
+ dropdown = gr.Dropdown(choices=get_available_models(), value=("No models found" if len(get_available_models()) < 1 else get_available_models()[0]), label='Custom Model (models folder)', elem_classes='slim-dropdown', interactive=True)
1049
+ ui.create_refresh_button(dropdown, lambda: None, lambda: {'choices': get_available_models()}, 'refresh-button')
1050
+ basemodel = gr.Radio(["small", "medium", "melody", "large"], label="Base Model", value="medium", interactive=True, scale=1)
1051
+ with gr.Row():
1052
+ decoder = gr.Radio(["Default", "MultiBand_Diffusion"], label="Decoder", value="Default", interactive=True)
1053
+ with gr.Row():
1054
+ topk = gr.Number(label="Top-k", value=250, interactive=True)
1055
+ topp = gr.Number(label="Top-p", value=0, interactive=True)
1056
+ temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
1057
+ cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
1058
+ with gr.Row():
1059
+ submit = gr.Button("Generate", variant="primary")
1060
+ # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
1061
+ _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
1062
+ with gr.Column() as c:
1063
+ with gr.Tab("Output"):
1064
+ output = gr.Video(label="Generated Music", scale=0)
1065
+ with gr.Row():
1066
+ audio_only = gr.Audio(type="numpy", label="Audio Only", interactive=False)
1067
+ backup_only = gr.Audio(type="numpy", label="Backup Audio", interactive=False, visible=False)
1068
+ send_audio = gr.Button("Send to Input Audio")
1069
+ seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
1070
+ download = gr.File(label="Generated Files", interactive=False)
1071
+ with gr.Tab("Wiki"):
1072
+ gr.Markdown(
1073
+ """
1074
+ - **[Generate (button)]:**
1075
+ Generates the music with the given settings and prompts.
1076
+
1077
+ - **[Interrupt (button)]:**
1078
+ Stops the music generation as soon as it can, providing an incomplete output.
1079
+
1080
+ ---
1081
+
1082
+ ### Generation Tab:
1083
+
1084
+ #### Structure Prompts:
1085
+
1086
+ This feature helps reduce repetitive prompts by allowing you to set global prompts
1087
+ that will be used for all prompt segments.
1088
+
1089
+ - **[Structure Prompts (checkbox)]:**
1090
+ Enable/Disable the structure prompts feature.
1091
+
1092
+ - **[BPM (number)]:**
1093
+ Beats per minute of the generated music.
1094
+
1095
+ - **[Key (dropdown)]:**
1096
+ The key of the generated music.
1097
+
1098
+ - **[Scale (dropdown)]:**
1099
+ The scale of the generated music.
1100
+
1101
+ - **[Global Prompt (text)]:**
1102
+ Here write the prompt that you wish to be used for all prompt segments.
1103
+
1104
+ #### Multi-Prompt:
1105
+
1106
+ This feature allows you to control the music, adding variation to different time segments.
1107
+ You have up to 10 prompt segments. the first prompt will always be 30s long
1108
+ the other prompts will be [30s - overlap].
1109
+ for example if the overlap is 10s, each prompt segment will be 20s.
1110
+
1111
+ - **[Prompt Segments (number)]:**
1112
+ Amount of unique prompt to generate throughout the music generation.
1113
+
1114
+ - **[Prompt/Input Text (prompt)]:**
1115
+ Here describe the music you wish the model to generate.
1116
+
1117
+ - **[Repeat (number)]:**
1118
+ Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
1119
+
1120
+ - **[Time (text)]:**
1121
+ The time of the prompt segment.
1122
+
1123
+ - **[Calculate Timings (button)]:**
1124
+ Calculates the timings of the prompt segments.
1125
+
1126
+ - **[Duration (number)]:**
1127
+ How long you want the generated music to be (in seconds).
1128
+
1129
+ - **[Overlap (number)]:**
1130
+ How much each new segment will reference the previous segment (in seconds).
1131
+ For example, if you choose 20s: Each new segment after the first one will reference the previous segment 20s
1132
+ and will generate only 10s of new music. The model can only process 30s of music.
1133
+
1134
+ - **[Seed (number)]:**
1135
+ Your generated music id. If you wish to generate the exact same music,
1136
+ place the exact seed with the exact prompts
1137
+ (This way you can also extend specific song that was generated short).
1138
+
1139
+ - **[Random Seed (button)]:**
1140
+ Gives "-1" as a seed, which counts as a random seed.
1141
+
1142
+ - **[Copy Previous Seed (button)]:**
1143
+ Copies the seed from the output seed (if you don't feel like doing it manually).
1144
+
1145
+ ---
1146
+
1147
+ ### Audio Tab:
1148
+
1149
+ - **[Input Type (selection)]:**
1150
+ `File` mode allows you to upload an audio file to use as input
1151
+ `Mic` mode allows you to use your microphone as input
1152
+
1153
+ - **[Input Audio Mode (selection)]:**
1154
+ `Melody` mode only works with the melody model: it conditions the music generation to reference the melody
1155
+ `Sample` mode works with any model: it gives a music sample to the model to generate its continuation.
1156
+
1157
+ - **[Trim Start and Trim End (numbers)]:**
1158
+ `Trim Start` set how much you'd like to trim the input audio from the start
1159
+ `Trim End` same as the above but from the end
1160
+
1161
+ - **[Input Audio (audio file)]:**
1162
+ Input here the audio you wish to use with "melody" or "sample" mode.
1163
+
1164
+ ---
1165
+
1166
+ ### Customization Tab:
1167
+
1168
+ - **[Background Color (color)]:**
1169
+ Works only if you don't upload image. Color of the background of the waveform.
1170
+
1171
+ - **[Bar Color Start (color)]:**
1172
+ First color of the waveform bars.
1173
+
1174
+ - **[Bar Color End (color)]:**
1175
+ Second color of the waveform bars.
1176
+
1177
+ - **[Background Image (image)]:**
1178
+ Background image that you wish to be attached to the generated video along with the waveform.
1179
+
1180
+ - **[Height and Width (numbers)]:**
1181
+ Output video resolution, only works with image.
1182
+ (minimum height and width is 256).
1183
+
1184
+ ---
1185
+
1186
+ ### Settings Tab:
1187
+
1188
+ - **[Output Audio Channels (selection)]:**
1189
+ With this you can select the amount of channels that you wish for your output audio.
1190
+ `mono` is a straightforward single channel audio
1191
+ `stereo` is a dual channel audio but it will sound more or less like mono
1192
+ `stereo effect` this one is also dual channel but uses tricks to simulate a stereo audio.
1193
+
1194
+ - **[Output Audio Sample Rate (dropdown)]:**
1195
+ The output audio sample rate, the model default is 32000.
1196
+
1197
+ - **[Model (selection)]:**
1198
+ Here you can choose which model you wish to use:
1199
+ `melody` model is based on the medium model with a unique feature that lets you use melody conditioning
1200
+ `small` model is trained on 300M parameters
1201
+ `medium` model is trained on 1.5B parameters
1202
+ `large` model is trained on 3.3B parameters
1203
+ `custom` model runs the custom model that you provided.
1204
+
1205
+ - **[Custom Model (selection)]:**
1206
+ This dropdown will show you models that are placed in the `models` folder
1207
+ you must select `custom` in the model options in order to use it.
1208
+
1209
+ - **[Refresh (button)]:**
1210
+ Refreshes the dropdown list for custom model.
1211
+
1212
+ - **[Base Model (selection)]:**
1213
+ Choose here the model that your custom model is based on.
1214
+
1215
+ - **[Decoder (selection)]:**
1216
+ Choose here the decoder that you wish to use:
1217
+ `Default` is the default decoder
1218
+ `MultiBand_Diffusion` is a decoder that uses diffusion to generate the audio.
1219
+
1220
+ - **[Top-k (number)]:**
1221
+ is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
1222
+
1223
+ - **[Top-p (number)]:**
1224
+ also known as nucleus sampling or probabilistic sampling, is another method used for token selection during text generation. Instead of specifying a fixed number like top-k, top-p considers the cumulative probability distribution of the ranked tokens. It selects the smallest possible set of tokens whose cumulative probability exceeds a certain threshold (usually denoted as p). The model then samples from this set to choose the next token. This approach ensures that the generated output maintains a balance between diversity and coherence, as it allows for a varying number of tokens to be considered based on their probabilities.
1225
+
1226
+ - **[Temperature (number)]:**
1227
+ is a parameter that controls the randomness of the generated output. It is applied during the sampling process, where a higher temperature value results in more random and diverse outputs, while a lower temperature value leads to more deterministic and focused outputs. In the context of music generation, a higher temperature can introduce more variability and creativity into the generated music, but it may also lead to less coherent or structured compositions. On the other hand, a lower temperature can produce more repetitive and predictable music.
1228
+
1229
+ - **[Classifier Free Guidance (number)]:**
1230
+ refers to a technique used in some music generation models where a separate classifier network is trained to provide guidance or control over the generated music. This classifier is trained on labeled data to recognize specific musical characteristics or styles. During the generation process, the output of the generator model is evaluated by the classifier, and the generator is encouraged to produce music that aligns with the desired characteristics or style. This approach allows for more fine-grained control over the generated music, enabling users to specify certain attributes they want the model to capture.
1231
+ """
1232
+ )
1233
+ with gr.Tab("AudioGen"):
1234
+ gr.Markdown(
1235
+ """
1236
+ ### AudioGen
1237
+ """
1238
+ )
1239
+ with gr.Row():
1240
+ with gr.Column():
1241
+ with gr.Tab("Generation"):
1242
+ with gr.Accordion("Structure Prompts", open=False):
1243
+ with gr.Row():
1244
+ struc_prompts_a = gr.Checkbox(label="Enable", value=False, interactive=True, container=False)
1245
+ global_prompt_a = gr.Text(label="Global Prompt", interactive=True, scale=3)
1246
+ with gr.Row():
1247
+ s_a = gr.Slider(1, max_textboxes, value=1, step=1, label="Prompts:", interactive=True, scale=2)
1248
+ with gr.Column():
1249
+ textboxes_a = []
1250
+ prompts_a = []
1251
+ repeats_a = []
1252
+ calcs_a = []
1253
+ with gr.Row():
1254
+ text0_a = gr.Text(label="Input Text", interactive=True, scale=4)
1255
+ prompts_a.append(text0_a)
1256
+ drag0_a = gr.Number(label="Repeat", value=1, interactive=True, scale=1)
1257
+ repeats_a.append(drag0_a)
1258
+ calc0_a = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
1259
+ calcs_a.append(calc0_a)
1260
+ for i in range(max_textboxes):
1261
+ with gr.Row(visible=False) as t_a:
1262
+ text_a = gr.Text(label="Input Text", interactive=True, scale=3)
1263
+ repeat_a = gr.Number(label="Repeat", minimum=1, value=1, interactive=True, scale=1)
1264
+ calc_a = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
1265
+ textboxes_a.append(t_a)
1266
+ prompts_a.append(text_a)
1267
+ repeats_a.append(repeat_a)
1268
+ calcs_a.append(calc_a)
1269
+ to_calc_a = gr.Button("Calculate Timings", variant="secondary")
1270
+ with gr.Row():
1271
+ duration_a = gr.Slider(minimum=1, maximum=300, value=10, step=1, label="Duration", interactive=True)
1272
+ with gr.Row():
1273
+ overlap_a = gr.Slider(minimum=1, maximum=9, value=2, step=1, label="Overlap", interactive=True)
1274
+ with gr.Row():
1275
+ seed_a = gr.Number(label="Seed", value=-1, scale=4, precision=0, interactive=True)
1276
+ gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[seed_a], queue=False)
1277
+ reuse_seed_a = gr.Button('\u267b\ufe0f', scale=1)
1278
+
1279
+ with gr.Tab("Audio"):
1280
+ with gr.Row():
1281
+ with gr.Column():
1282
+ input_type_a = gr.Radio(["file", "mic"], value="file", label="Input Type (optional)", interactive=True)
1283
+ mode_a = gr.Radio(["sample"], label="Input Audio Mode (optional)", value="sample", interactive=False, visible=False)
1284
+ with gr.Row():
1285
+ trim_start_a = gr.Number(label="Trim Start", value=0, interactive=True)
1286
+ trim_end_a = gr.Number(label="Trim End", value=0, interactive=True)
1287
+ audio_a = gr.Audio(source="upload", type="numpy", label="Input Audio (optional)", interactive=True)
1288
+
1289
+ with gr.Tab("Customization"):
1290
+ with gr.Row():
1291
+ with gr.Column():
1292
+ background_a = gr.ColorPicker(value="#0f0f0f", label="background color", interactive=True, scale=0)
1293
+ bar1_a = gr.ColorPicker(value="#84cc16", label="bar color start", interactive=True, scale=0)
1294
+ bar2_a = gr.ColorPicker(value="#10b981", label="bar color end", interactive=True, scale=0)
1295
+ with gr.Column():
1296
+ image_a = gr.Image(label="Background Image", type="filepath", interactive=True, scale=4)
1297
+ with gr.Row():
1298
+ height_a = gr.Number(label="Height", value=512, interactive=True)
1299
+ width_a = gr.Number(label="Width", value=768, interactive=True)
1300
+
1301
+ with gr.Tab("Settings"):
1302
+ with gr.Row():
1303
+ channel_a = gr.Radio(["mono", "stereo", "stereo effect"], label="Output Audio Channels", value="stereo", interactive=True, scale=1)
1304
+ sr_select_a = gr.Dropdown(["11025", "16000", "22050", "24000", "32000", "44100", "48000"], label="Output Audio Sample Rate", value="48000", interactive=True)
1305
+ with gr.Row():
1306
+ model_a = gr.Radio(["medium"], label="Model", value="medium", interactive=False, visible=False)
1307
+ decoder_a = gr.Radio(["Default"], label="Decoder", value="Default", interactive=False, visible=False)
1308
+ with gr.Row():
1309
+ topk_a = gr.Number(label="Top-k", value=250, interactive=True)
1310
+ topp_a = gr.Number(label="Top-p", value=0, interactive=True)
1311
+ temperature_a = gr.Number(label="Temperature", value=1.0, interactive=True)
1312
+ cfg_coef_a = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
1313
+ with gr.Row():
1314
+ submit_a = gr.Button("Generate", variant="primary")
1315
+ _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
1316
+ with gr.Column():
1317
+ with gr.Tab("Output"):
1318
+ output_a = gr.Video(label="Generated Audio", scale=0)
1319
+ with gr.Row():
1320
+ audio_only_a = gr.Audio(type="numpy", label="Audio Only", interactive=False)
1321
+ backup_only_a = gr.Audio(type="numpy", label="Backup Audio", interactive=False, visible=False)
1322
+ send_audio_a = gr.Button("Send to Input Audio")
1323
+ seed_used_a = gr.Number(label='Seed used', value=-1, interactive=False)
1324
+ download_a = gr.File(label="Generated Files", interactive=False)
1325
+ with gr.Tab("Wiki"):
1326
+ gr.Markdown(
1327
+ """
1328
+ - **[Generate (button)]:**
1329
+ Generates the audio with the given settings and prompts.
1330
+
1331
+ - **[Interrupt (button)]:**
1332
+ Stops the audio generation as soon as it can, providing an incomplete output.
1333
+
1334
+ ---
1335
+
1336
+ ### Generation Tab:
1337
+
1338
+ #### Structure Prompts:
1339
+
1340
+ This feature helps reduce repetitive prompts by allowing you to set global prompts
1341
+ that will be used for all prompt segments.
1342
+
1343
+ - **[Structure Prompts (checkbox)]:**
1344
+ Enable/Disable the structure prompts feature.
1345
+
1346
+ - **[Global Prompt (text)]:**
1347
+ Here write the prompt that you wish to be used for all prompt segments.
1348
+
1349
+ #### Multi-Prompt:
1350
+
1351
+ This feature allows you to control the audio, adding variation to different time segments.
1352
+ You have up to 10 prompt segments. the first prompt will always be 10s long
1353
+ the other prompts will be [10s - overlap].
1354
+ for example if the overlap is 2s, each prompt segment will be 8s.
1355
+
1356
+ - **[Prompt Segments (number)]:**
1357
+ Amount of unique prompt to generate throughout the audio generation.
1358
+
1359
+ - **[Prompt/Input Text (prompt)]:**
1360
+ Here describe the audio you wish the model to generate.
1361
+
1362
+ - **[Repeat (number)]:**
1363
+ Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
1364
+
1365
+ - **[Time (text)]:**
1366
+ The time of the prompt segment.
1367
+
1368
+ - **[Calculate Timings (button)]:**
1369
+ Calculates the timings of the prompt segments.
1370
+
1371
+ - **[Duration (number)]:**
1372
+ How long you want the generated audio to be (in seconds).
1373
+
1374
+ - **[Overlap (number)]:**
1375
+ How much each new segment will reference the previous segment (in seconds).
1376
+ For example, if you choose 2s: Each new segment after the first one will reference the previous segment 2s
1377
+ and will generate only 8s of new audio. The model can only process 10s of music.
1378
+
1379
+ - **[Seed (number)]:**
1380
+ Your generated audio id. If you wish to generate the exact same audio,
1381
+ place the exact seed with the exact prompts
1382
+ (This way you can also extend specific song that was generated short).
1383
+
1384
+ - **[Random Seed (button)]:**
1385
+ Gives "-1" as a seed, which counts as a random seed.
1386
+
1387
+ - **[Copy Previous Seed (button)]:**
1388
+ Copies the seed from the output seed (if you don't feel like doing it manually).
1389
+
1390
+ ---
1391
+
1392
+ ### Audio Tab:
1393
+
1394
+ - **[Input Type (selection)]:**
1395
+ `File` mode allows you to upload an audio file to use as input
1396
+ `Mic` mode allows you to use your microphone as input
1397
+
1398
+ - **[Trim Start and Trim End (numbers)]:**
1399
+ `Trim Start` set how much you'd like to trim the input audio from the start
1400
+ `Trim End` same as the above but from the end
1401
+
1402
+ - **[Input Audio (audio file)]:**
1403
+ Input here the audio you wish to use.
1404
+
1405
+ ---
1406
+
1407
+ ### Customization Tab:
1408
+
1409
+ - **[Background Color (color)]:**
1410
+ Works only if you don't upload image. Color of the background of the waveform.
1411
+
1412
+ - **[Bar Color Start (color)]:**
1413
+ First color of the waveform bars.
1414
+
1415
+ - **[Bar Color End (color)]:**
1416
+ Second color of the waveform bars.
1417
+
1418
+ - **[Background Image (image)]:**
1419
+ Background image that you wish to be attached to the generated video along with the waveform.
1420
+
1421
+ - **[Height and Width (numbers)]:**
1422
+ Output video resolution, only works with image.
1423
+ (minimum height and width is 256).
1424
+
1425
+ ---
1426
+
1427
+ ### Settings Tab:
1428
+
1429
+ - **[Output Audio Channels (selection)]:**
1430
+ With this you can select the amount of channels that you wish for your output audio.
1431
+ `mono` is a straightforward single channel audio
1432
+ `stereo` is a dual channel audio but it will sound more or less like mono
1433
+ `stereo effect` this one is also dual channel but uses tricks to simulate a stereo audio.
1434
+
1435
+ - **[Output Audio Sample Rate (dropdown)]:**
1436
+ The output audio sample rate, the model default is 32000.
1437
+
1438
+ - **[Top-k (number)]:**
1439
+ is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
1440
+
1441
+ - **[Top-p (number)]:**
1442
+ also known as nucleus sampling or probabilistic sampling, is another method used for token selection during text generation. Instead of specifying a fixed number like top-k, top-p considers the cumulative probability distribution of the ranked tokens. It selects the smallest possible set of tokens whose cumulative probability exceeds a certain threshold (usually denoted as p). The model then samples from this set to choose the next token. This approach ensures that the generated output maintains a balance between diversity and coherence, as it allows for a varying number of tokens to be considered based on their probabilities.
1443
+
1444
+ - **[Temperature (number)]:**
1445
+ is a parameter that controls the randomness of the generated output. It is applied during the sampling process, where a higher temperature value results in more random and diverse outputs, while a lower temperature value leads to more deterministic and focused outputs. In the context of music generation, a higher temperature can introduce more variability and creativity into the generated music, but it may also lead to less coherent or structured compositions. On the other hand, a lower temperature can produce more repetitive and predictable music.
1446
+
1447
+ - **[Classifier Free Guidance (number)]:**
1448
+ refers to a technique used in some music generation models where a separate classifier network is trained to provide guidance or control over the generated music. This classifier is trained on labeled data to recognize specific musical characteristics or styles. During the generation process, the output of the generator model is evaluated by the classifier, and the generator is encouraged to produce music that aligns with the desired characteristics or style. This approach allows for more fine-grained control over the generated music, enabling users to specify certain attributes they want the model to capture.
1449
+ """
1450
+ )
1451
+ with gr.Tab("Audio Info"):
1452
+ gr.Markdown(
1453
+ """
1454
+ ### Audio Info
1455
+ """
1456
+ )
1457
+ with gr.Row():
1458
+ with gr.Column():
1459
+ in_audio = gr.File(type="file", label="Input Any Audio", interactive=True)
1460
+ with gr.Row():
1461
+ send_gen = gr.Button("Send to MusicGen", variant="primary")
1462
+ send_gen_a = gr.Button("Send to AudioGen", variant="primary")
1463
+ with gr.Column():
1464
+ info = gr.Textbox(label="Audio Info", lines=10, interactive=False)
1465
+ with gr.Tab("Changelog"):
1466
+ gr.Markdown(
1467
+ """
1468
+ ## Changelog:
1469
+
1470
+ ### v2.0.0a
1471
+
1472
+ - Forgot to move all the update to app.py from temp2.py... oops
1473
+
1474
+
1475
+
1476
+ ### v2.0.0
1477
+
1478
+ - Changed name from MusicGen+ to AudioCraft Plus
1479
+
1480
+ - Complete overhaul of the repo "backend" with the latest changes from the main facebookresearch repo
1481
+
1482
+ - Added a new decoder: MultiBand_Diffusion
1483
+
1484
+ - Added AudioGen: a new tab for generating audio
1485
+
1486
+
1487
+
1488
+ ### v1.2.8c
1489
+
1490
+ - Implemented Reverse compatibility for audio info tab with previous versions
1491
+
1492
+
1493
+
1494
+ ### v1.2.8b
1495
+
1496
+ - Fixed the error when loading default models
1497
+
1498
+
1499
+
1500
+ ### v1.2.8a
1501
+
1502
+ - Adapted Audio info tab to work with the new structure prompts feature
1503
+
1504
+ - Now custom models actually work, make sure you select the correct base model
1505
+
1506
+
1507
+
1508
+ ### v1.2.8
1509
+
1510
+ - Now you will also receive json file with metadata of generated audio
1511
+
1512
+ - Added error messages in Audio Info tab
1513
+
1514
+ - Added structure prompts: you can select bpm, key and global prompt for all prompts
1515
+
1516
+ - Added time display next to each prompt, can be calculated with "Calculate Timings" button
1517
+
1518
+
1519
+
1520
+ ### v1.2.7
1521
+
1522
+ - When sending generated audio to Input Audio, it will send a backup audio with default settings
1523
+ (best for continuous generation)
1524
+
1525
+ - Added Metadata to generated audio (Thanks to AlexHK ♥)
1526
+
1527
+ - Added Audio Info tab that will display the metadata of the input audio
1528
+
1529
+ - Added "send to Text2Audio" button in Audio Info tab
1530
+
1531
+ - Generated audio is now stored in the "output" folder (Thanks to AlexHK ♥)
1532
+
1533
+ - Added an output area with generated files and download buttons
1534
+
1535
+ - Enhanced Stereo effect (Thanks to AlexHK ♥)
1536
+
1537
+
1538
+
1539
+ ### v1.2.6
1540
+
1541
+ - Added option to generate in stereo (instead of only mono)
1542
+
1543
+ - Added dropdown for selecting output sample rate (model default is 32000)
1544
+
1545
+
1546
+
1547
+ ### v1.2.5a
1548
+
1549
+ - Added file cleaner (This comes from the main facebookresearch repo)
1550
+
1551
+ - Reorganized a little, moved audio to a separate tab
1552
+
1553
+
1554
+
1555
+ ### v1.2.5
1556
+
1557
+ - Gave a unique lime theme to the webui
1558
+
1559
+ - Added additional output for audio only
1560
+
1561
+ - Added button to send generated audio to Input Audio
1562
+
1563
+ - Added option to trim Input Audio
1564
+
1565
+
1566
+
1567
+ ### v1.2.4
1568
+
1569
+ - Added mic input (This comes from the main facebookresearch repo)
1570
+
1571
+
1572
+
1573
+ ### v1.2.3
1574
+
1575
+ - Added option to change video size to fit the image you upload
1576
+
1577
+
1578
+
1579
+ ### v1.2.2
1580
+
1581
+ - Added Wiki, Changelog and About tabs
1582
+
1583
+
1584
+
1585
+ ### v1.2.1
1586
+
1587
+ - Added tabs and organized the entire interface
1588
+
1589
+ - Added option to attach image to the output video
1590
+
1591
+ - Added option to load fine-tuned models (Yet to be tested)
1592
+
1593
+
1594
+
1595
+ ### v1.2.0
1596
+
1597
+ - Added Multi-Prompt
1598
+
1599
+
1600
+
1601
+ ### v1.1.3
1602
+
1603
+ - Added customization options for generated waveform
1604
+
1605
+
1606
+
1607
+ ### v1.1.2
1608
+
1609
+ - Removed sample length limit: now you can input audio of any length as music sample
1610
+
1611
+
1612
+
1613
+ ### v1.1.1
1614
+
1615
+ - Improved music sample audio quality when using music continuation
1616
+
1617
+
1618
+
1619
+ ### v1.1.0
1620
+
1621
+ - Rebuilt the repo on top of the latest structure of the main MusicGen repo
1622
+
1623
+ - Improved Music continuation feature
1624
+
1625
+
1626
+
1627
+ ### v1.0.0 - Stable Version
1628
+
1629
+ - Added Music continuation
1630
+ """
1631
+ )
1632
+ with gr.Tab("About"):
1633
+ gen_type = gr.Text(value="music", interactive=False, visible=False)
1634
+ gen_type_a = gr.Text(value="audio", interactive=False, visible=False)
1635
+ gr.Markdown(
1636
+ """
1637
+ This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
1638
+ presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
1639
+
1640
+ ## MusicGen+ is an extended version of the original MusicGen by facebookresearch.
1641
+
1642
+ ### Repo: https://github.com/GrandaddyShmax/audiocraft_plus/tree/plus
1643
+
1644
+ ---
1645
+
1646
+ ### This project was possible thanks to:
1647
+
1648
+ #### GrandaddyShmax - https://github.com/GrandaddyShmax
1649
+
1650
+ #### Camenduru - https://github.com/camenduru
1651
+
1652
+ #### rkfg - https://github.com/rkfg
1653
+
1654
+ #### oobabooga - https://github.com/oobabooga
1655
+
1656
+ #### AlexHK - https://github.com/alanhk147
1657
+ """
1658
+ )
1659
+
1660
+ send_gen.click(info_to_params, inputs=[in_audio], outputs=[decoder, struc_prompts, global_prompt, bpm, key, scale, model, dropdown, basemodel, s, prompts[0], prompts[1], prompts[2], prompts[3], prompts[4], prompts[5], prompts[6], prompts[7], prompts[8], prompts[9], repeats[0], repeats[1], repeats[2], repeats[3], repeats[4], repeats[5], repeats[6], repeats[7], repeats[8], repeats[9], mode, duration, topk, topp, temperature, cfg_coef, seed, overlap, channel, sr_select], queue=False)
1661
+ reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False)
1662
+ send_audio.click(fn=lambda x: x, inputs=[backup_only], outputs=[audio], queue=False)
1663
+ submit.click(predict_full, inputs=[gen_type, model, decoder, dropdown, basemodel, s, struc_prompts, bpm, key, scale, global_prompt, prompts[0], prompts[1], prompts[2], prompts[3], prompts[4], prompts[5], prompts[6], prompts[7], prompts[8], prompts[9], repeats[0], repeats[1], repeats[2], repeats[3], repeats[4], repeats[5], repeats[6], repeats[7], repeats[8], repeats[9], audio, mode, trim_start, trim_end, duration, topk, topp, temperature, cfg_coef, seed, overlap, image, height, width, background, bar1, bar2, channel, sr_select], outputs=[output, audio_only, backup_only, download, seed_used])
1664
+ input_type.change(toggle_audio_src, input_type, [audio], queue=False, show_progress=False)
1665
+ to_calc.click(calc_time, inputs=[gen_type, s, duration, overlap, repeats[0], repeats[1], repeats[2], repeats[3], repeats[4], repeats[5], repeats[6], repeats[7], repeats[8], repeats[9]], outputs=[calcs[0], calcs[1], calcs[2], calcs[3], calcs[4], calcs[5], calcs[6], calcs[7], calcs[8], calcs[9]], queue=False)
1666
+
1667
+ send_gen_a.click(info_to_params_a, inputs=[in_audio], outputs=[decoder_a, struc_prompts_a, global_prompt_a, s_a, prompts_a[0], prompts_a[1], prompts_a[2], prompts_a[3], prompts_a[4], prompts_a[5], prompts_a[6], prompts_a[7], prompts_a[8], prompts_a[9], repeats_a[0], repeats_a[1], repeats_a[2], repeats_a[3], repeats_a[4], repeats_a[5], repeats_a[6], repeats_a[7], repeats_a[8], repeats_a[9], duration_a, topk_a, topp_a, temperature_a, cfg_coef_a, seed_a, overlap_a, channel_a, sr_select_a], queue=False)
1668
+ reuse_seed_a.click(fn=lambda x: x, inputs=[seed_used_a], outputs=[seed_a], queue=False)
1669
+ send_audio_a.click(fn=lambda x: x, inputs=[backup_only_a], outputs=[audio_a], queue=False)
1670
+ submit_a.click(predict_full, inputs=[gen_type_a, model_a, decoder_a, dropdown, basemodel, s_a, struc_prompts_a, bpm, key, scale, global_prompt_a, prompts_a[0], prompts_a[1], prompts_a[2], prompts_a[3], prompts_a[4], prompts_a[5], prompts_a[6], prompts_a[7], prompts_a[8], prompts_a[9], repeats_a[0], repeats_a[1], repeats_a[2], repeats_a[3], repeats_a[4], repeats_a[5], repeats_a[6], repeats_a[7], repeats_a[8], repeats_a[9], audio_a, mode_a, trim_start_a, trim_end_a, duration_a, topk_a, topp_a, temperature_a, cfg_coef_a, seed_a, overlap_a, image_a, height_a, width_a, background_a, bar1_a, bar2_a, channel_a, sr_select_a], outputs=[output_a, audio_only_a, backup_only_a, download_a, seed_used_a])
1671
+ input_type_a.change(toggle_audio_src, input_type_a, [audio_a], queue=False, show_progress=False)
1672
+ to_calc_a.click(calc_time, inputs=[gen_type_a, s_a, duration_a, overlap_a, repeats_a[0], repeats_a[1], repeats_a[2], repeats_a[3], repeats_a[4], repeats_a[5], repeats_a[6], repeats_a[7], repeats_a[8], repeats_a[9]], outputs=[calcs_a[0], calcs_a[1], calcs_a[2], calcs_a[3], calcs_a[4], calcs_a[5], calcs_a[6], calcs_a[7], calcs_a[8], calcs_a[9]], queue=False)
1673
+
1674
+ in_audio.change(get_audio_info, in_audio, outputs=[info])
1675
+
1676
+ def variable_outputs(k):
1677
+ k = int(k) - 1
1678
+ return [gr.Textbox.update(visible=True)]*k + [gr.Textbox.update(visible=False)]*(max_textboxes-k)
1679
+ def get_size(image):
1680
+ if image is not None:
1681
+ img = Image.open(image)
1682
+ img_height = img.height
1683
+ img_width = img.width
1684
+ if (img_height%2) != 0:
1685
+ img_height = img_height + 1
1686
+ if (img_width%2) != 0:
1687
+ img_width = img_width + 1
1688
+ return img_height, img_width
1689
+ else:
1690
+ return 512, 768
1691
+
1692
+ image.change(get_size, image, outputs=[height, width])
1693
+ image_a.change(get_size, image_a, outputs=[height_a, width_a])
1694
+ s.change(variable_outputs, s, textboxes)
1695
+ s_a.change(variable_outputs, s_a, textboxes_a)
1696
+ interface.queue().launch(**launch_kwargs)
1697
+
1698
+
1699
+ def ui_batched(launch_kwargs):
1700
+ with gr.Blocks() as demo:
1701
+ gr.Markdown(
1702
+ """
1703
+ # MusicGen
1704
+
1705
+ This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
1706
+ a simple and controllable model for music generation
1707
+ presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
1708
+ <br/>
1709
+ <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
1710
+ style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
1711
+ <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
1712
+ src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
1713
+ for longer sequences, more control and no queue.</p>
1714
+ """
1715
+ )
1716
+ with gr.Row():
1717
+ with gr.Column():
1718
+ with gr.Row():
1719
+ text = gr.Text(label="Describe your music", lines=2, interactive=True)
1720
+ with gr.Column():
1721
+ radio = gr.Radio(["file", "mic"], value="file",
1722
+ label="Condition on a melody (optional) File or Mic")
1723
+ melody = gr.Audio(source="upload", type="numpy", label="File",
1724
+ interactive=True, elem_id="melody-input")
1725
+ with gr.Row():
1726
+ submit = gr.Button("Generate")
1727
+ with gr.Column():
1728
+ output = gr.Video(label="Generated Music")
1729
+ audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
1730
+ submit.click(predict_batched, inputs=[text, melody],
1731
+ outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
1732
+ radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
1733
+ gr.Examples(
1734
+ fn=predict_batched,
1735
+ examples=[
1736
+ [
1737
+ "An 80s driving pop song with heavy drums and synth pads in the background",
1738
+ "./assets/bach.mp3",
1739
+ ],
1740
+ [
1741
+ "A cheerful country song with acoustic guitars",
1742
+ "./assets/bolero_ravel.mp3",
1743
+ ],
1744
+ [
1745
+ "90s rock song with electric guitar and heavy drums",
1746
+ None,
1747
+ ],
1748
+ [
1749
+ "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
1750
+ "./assets/bach.mp3",
1751
+ ],
1752
+ [
1753
+ "lofi slow bpm electro chill with organic samples",
1754
+ None,
1755
+ ],
1756
+ ],
1757
+ inputs=[text, melody],
1758
+ outputs=[output]
1759
+ )
1760
+ gr.Markdown("""
1761
+ ### More details
1762
+
1763
+ The model will generate 12 seconds of audio based on the description you provided.
1764
+ You can optionally provide a reference audio from which a broad melody will be extracted.
1765
+ The model will then try to follow both the description and melody provided.
1766
+ All samples are generated with the `melody` model.
1767
+
1768
+ You can also use your own GPU or a Google Colab by following the instructions on our repo.
1769
+
1770
+ See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
1771
+ for more details.
1772
+ """)
1773
+
1774
+ demo.queue(max_size=8 * 4).launch(**launch_kwargs)
1775
+
1776
+
1777
+ if __name__ == "__main__":
1778
+ parser = argparse.ArgumentParser()
1779
+ parser.add_argument(
1780
+ '--listen',
1781
+ type=str,
1782
+ default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
1783
+ help='IP to listen on for connections to Gradio',
1784
+ )
1785
+ parser.add_argument(
1786
+ '--username', type=str, default='', help='Username for authentication'
1787
+ )
1788
+ parser.add_argument(
1789
+ '--password', type=str, default='', help='Password for authentication'
1790
+ )
1791
+ parser.add_argument(
1792
+ '--server_port',
1793
+ type=int,
1794
+ default=0,
1795
+ help='Port to run the server listener on',
1796
+ )
1797
+ parser.add_argument(
1798
+ '--inbrowser', action='store_true', help='Open in browser'
1799
+ )
1800
+ parser.add_argument(
1801
+ '--share', action='store_true', help='Share the gradio UI'
1802
+ )
1803
+ parser.add_argument(
1804
+ '--unload_model', action='store_true', help='Unload the model after every generation to save GPU memory'
1805
+ )
1806
+
1807
+ parser.add_argument(
1808
+ '--unload_to_cpu', action='store_true', help='Move the model to main RAM after every generation to save GPU memory but reload faster than after full unload (see above)'
1809
+ )
1810
+
1811
+ parser.add_argument(
1812
+ '--cache', action='store_true', help='Cache models in RAM to quickly switch between them'
1813
+ )
1814
+
1815
+ args = parser.parse_args()
1816
+ UNLOAD_MODEL = args.unload_model
1817
+ MOVE_TO_CPU = args.unload_to_cpu
1818
+ if args.cache:
1819
+ MODELS = {}
1820
+
1821
+ launch_kwargs = {}
1822
+ launch_kwargs['server_name'] = args.listen
1823
+
1824
+ if args.username and args.password:
1825
+ launch_kwargs['auth'] = (args.username, args.password)
1826
+ if args.server_port:
1827
+ launch_kwargs['server_port'] = args.server_port
1828
+ if args.inbrowser:
1829
+ launch_kwargs['inbrowser'] = args.inbrowser
1830
+ if args.share:
1831
+ launch_kwargs['share'] = args.share
1832
+
1833
+ # Show the interface
1834
+ if IS_BATCHED:
1835
+ global USE_DIFFUSION
1836
+ USE_DIFFUSION = False
1837
+ ui_batched(launch_kwargs)
1838
+ else:
1839
+ ui_full(launch_kwargs)
assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 ADDED
Binary file (15.2 kB). View file
 
assets/bach.mp3 ADDED
Binary file (160 kB). View file
 
assets/bolero_ravel.mp3 ADDED
Binary file (161 kB). View file
 
assets/sirens_and_a_humming_engine_approach_and_pass.mp3 ADDED
Binary file (15.2 kB). View file
 
audiocraft/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ AudioCraft is a general framework for training audio generative models.
8
+ At the moment we provide the training code for:
9
+
10
+ - [MusicGen](https://arxiv.org/abs/2306.05284), a state-of-the-art
11
+ text-to-music and melody+text autoregressive generative model.
12
+ For the solver, see `audiocraft.solvers.musicgen.MusicGenSolver`, and for the model,
13
+ `audiocraft.models.musicgen.MusicGen`.
14
+ - [AudioGen](https://arxiv.org/abs/2209.15352), a state-of-the-art
15
+ text-to-general-audio generative model.
16
+ - [EnCodec](https://arxiv.org/abs/2210.13438), efficient and high fidelity
17
+ neural audio codec which provides an excellent tokenizer for autoregressive language models.
18
+ See `audiocraft.solvers.compression.CompressionSolver`, and `audiocraft.models.encodec.EncodecModel`.
19
+ - [MultiBandDiffusion](TODO), alternative diffusion-based decoder compatible with EnCodec that
20
+ improves the perceived quality and reduces the artifacts coming from adversarial decoders.
21
+ """
22
+
23
+ # flake8: noqa
24
+ from . import data, modules, models
25
+
26
+ __version__ = '1.0.0'
audiocraft/adversarial/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Adversarial losses and discriminator architectures."""
7
+
8
+ # flake8: noqa
9
+ from .discriminators import (
10
+ MultiPeriodDiscriminator,
11
+ MultiScaleDiscriminator,
12
+ MultiScaleSTFTDiscriminator
13
+ )
14
+ from .losses import (
15
+ AdversarialLoss,
16
+ AdvLossType,
17
+ get_adv_criterion,
18
+ get_fake_criterion,
19
+ get_real_criterion,
20
+ FeatLossType,
21
+ FeatureMatchingLoss
22
+ )
audiocraft/adversarial/discriminators/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # flake8: noqa
8
+ from .mpd import MultiPeriodDiscriminator
9
+ from .msd import MultiScaleDiscriminator
10
+ from .msstftd import MultiScaleSTFTDiscriminator
audiocraft/adversarial/discriminators/base.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from abc import ABC, abstractmethod
8
+ import typing as tp
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+
14
+ FeatureMapType = tp.List[torch.Tensor]
15
+ LogitsType = torch.Tensor
16
+ MultiDiscriminatorOutputType = tp.Tuple[tp.List[LogitsType], tp.List[FeatureMapType]]
17
+
18
+
19
+ class MultiDiscriminator(ABC, nn.Module):
20
+ """Base implementation for discriminators composed of sub-discriminators acting at different scales.
21
+ """
22
+ def __init__(self):
23
+ super().__init__()
24
+
25
+ @abstractmethod
26
+ def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
27
+ ...
28
+
29
+ @property
30
+ @abstractmethod
31
+ def num_discriminators(self) -> int:
32
+ """Number of discriminators.
33
+ """
34
+ ...
audiocraft/adversarial/discriminators/mpd.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from ...modules import NormConv2d
14
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
15
+
16
+
17
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
18
+ return int((kernel_size * dilation - dilation) / 2)
19
+
20
+
21
+ class PeriodDiscriminator(nn.Module):
22
+ """Period sub-discriminator.
23
+
24
+ Args:
25
+ period (int): Period between samples of audio.
26
+ in_channels (int): Number of input channels.
27
+ out_channels (int): Number of output channels.
28
+ n_layers (int): Number of convolutional layers.
29
+ kernel_sizes (list of int): Kernel sizes for convolutions.
30
+ stride (int): Stride for convolutions.
31
+ filters (int): Initial number of filters in convolutions.
32
+ filters_scale (int): Multiplier of number of filters as we increase depth.
33
+ max_filters (int): Maximum number of filters.
34
+ norm (str): Normalization method.
35
+ activation (str): Activation function.
36
+ activation_params (dict): Parameters to provide to the activation function.
37
+ """
38
+ def __init__(self, period: int, in_channels: int = 1, out_channels: int = 1,
39
+ n_layers: int = 5, kernel_sizes: tp.List[int] = [5, 3], stride: int = 3,
40
+ filters: int = 8, filters_scale: int = 4, max_filters: int = 1024,
41
+ norm: str = 'weight_norm', activation: str = 'LeakyReLU',
42
+ activation_params: dict = {'negative_slope': 0.2}):
43
+ super().__init__()
44
+ self.period = period
45
+ self.n_layers = n_layers
46
+ self.activation = getattr(torch.nn, activation)(**activation_params)
47
+ self.convs = nn.ModuleList()
48
+ in_chs = in_channels
49
+ for i in range(self.n_layers):
50
+ out_chs = min(filters * (filters_scale ** (i + 1)), max_filters)
51
+ eff_stride = 1 if i == self.n_layers - 1 else stride
52
+ self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=(kernel_sizes[0], 1), stride=(eff_stride, 1),
53
+ padding=((kernel_sizes[0] - 1) // 2, 0), norm=norm))
54
+ in_chs = out_chs
55
+ self.conv_post = NormConv2d(in_chs, out_channels, kernel_size=(kernel_sizes[1], 1), stride=1,
56
+ padding=((kernel_sizes[1] - 1) // 2, 0), norm=norm)
57
+
58
+ def forward(self, x: torch.Tensor):
59
+ fmap = []
60
+ # 1d to 2d
61
+ b, c, t = x.shape
62
+ if t % self.period != 0: # pad first
63
+ n_pad = self.period - (t % self.period)
64
+ x = F.pad(x, (0, n_pad), 'reflect')
65
+ t = t + n_pad
66
+ x = x.view(b, c, t // self.period, self.period)
67
+
68
+ for conv in self.convs:
69
+ x = conv(x)
70
+ x = self.activation(x)
71
+ fmap.append(x)
72
+ x = self.conv_post(x)
73
+ fmap.append(x)
74
+ # x = torch.flatten(x, 1, -1)
75
+
76
+ return x, fmap
77
+
78
+
79
+ class MultiPeriodDiscriminator(MultiDiscriminator):
80
+ """Multi-Period (MPD) Discriminator.
81
+
82
+ Args:
83
+ in_channels (int): Number of input channels.
84
+ out_channels (int): Number of output channels.
85
+ periods (Sequence[int]): Periods between samples of audio for the sub-discriminators.
86
+ **kwargs: Additional args for `PeriodDiscriminator`
87
+ """
88
+ def __init__(self, in_channels: int = 1, out_channels: int = 1,
89
+ periods: tp.Sequence[int] = [2, 3, 5, 7, 11], **kwargs):
90
+ super().__init__()
91
+ self.discriminators = nn.ModuleList([
92
+ PeriodDiscriminator(p, in_channels, out_channels, **kwargs) for p in periods
93
+ ])
94
+
95
+ @property
96
+ def num_discriminators(self):
97
+ return len(self.discriminators)
98
+
99
+ def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
100
+ logits = []
101
+ fmaps = []
102
+ for disc in self.discriminators:
103
+ logit, fmap = disc(x)
104
+ logits.append(logit)
105
+ fmaps.append(fmap)
106
+ return logits, fmaps
audiocraft/adversarial/discriminators/msd.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+ from ...modules import NormConv1d
14
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
15
+
16
+
17
+ class ScaleDiscriminator(nn.Module):
18
+ """Waveform sub-discriminator.
19
+
20
+ Args:
21
+ in_channels (int): Number of input channels.
22
+ out_channels (int): Number of output channels.
23
+ kernel_sizes (Sequence[int]): Kernel sizes for first and last convolutions.
24
+ filters (int): Number of initial filters for convolutions.
25
+ max_filters (int): Maximum number of filters.
26
+ downsample_scales (Sequence[int]): Scale for downsampling implemented as strided convolutions.
27
+ inner_kernel_sizes (Sequence[int] or None): Kernel sizes for inner convolutions.
28
+ groups (Sequence[int] or None): Groups for inner convolutions.
29
+ strides (Sequence[int] or None): Strides for inner convolutions.
30
+ paddings (Sequence[int] or None): Paddings for inner convolutions.
31
+ norm (str): Normalization method.
32
+ activation (str): Activation function.
33
+ activation_params (dict): Parameters to provide to the activation function.
34
+ pad (str): Padding for initial convolution.
35
+ pad_params (dict): Parameters to provide to the padding module.
36
+ """
37
+ def __init__(self, in_channels=1, out_channels=1, kernel_sizes: tp.Sequence[int] = [5, 3],
38
+ filters: int = 16, max_filters: int = 1024, downsample_scales: tp.Sequence[int] = [4, 4, 4, 4],
39
+ inner_kernel_sizes: tp.Optional[tp.Sequence[int]] = None, groups: tp.Optional[tp.Sequence[int]] = None,
40
+ strides: tp.Optional[tp.Sequence[int]] = None, paddings: tp.Optional[tp.Sequence[int]] = None,
41
+ norm: str = 'weight_norm', activation: str = 'LeakyReLU',
42
+ activation_params: dict = {'negative_slope': 0.2}, pad: str = 'ReflectionPad1d',
43
+ pad_params: dict = {}):
44
+ super().__init__()
45
+ assert len(kernel_sizes) == 2
46
+ assert kernel_sizes[0] % 2 == 1
47
+ assert kernel_sizes[1] % 2 == 1
48
+ assert (inner_kernel_sizes is None or len(inner_kernel_sizes) == len(downsample_scales))
49
+ assert (groups is None or len(groups) == len(downsample_scales))
50
+ assert (strides is None or len(strides) == len(downsample_scales))
51
+ assert (paddings is None or len(paddings) == len(downsample_scales))
52
+ self.activation = getattr(torch.nn, activation)(**activation_params)
53
+ self.convs = nn.ModuleList()
54
+ self.convs.append(
55
+ nn.Sequential(
56
+ getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params),
57
+ NormConv1d(in_channels, filters, kernel_size=np.prod(kernel_sizes), stride=1, norm=norm)
58
+ )
59
+ )
60
+
61
+ in_chs = filters
62
+ for i, downsample_scale in enumerate(downsample_scales):
63
+ out_chs = min(in_chs * downsample_scale, max_filters)
64
+ default_kernel_size = downsample_scale * 10 + 1
65
+ default_stride = downsample_scale
66
+ default_padding = (default_kernel_size - 1) // 2
67
+ default_groups = in_chs // 4
68
+ self.convs.append(
69
+ NormConv1d(in_chs, out_chs,
70
+ kernel_size=inner_kernel_sizes[i] if inner_kernel_sizes else default_kernel_size,
71
+ stride=strides[i] if strides else default_stride,
72
+ groups=groups[i] if groups else default_groups,
73
+ padding=paddings[i] if paddings else default_padding,
74
+ norm=norm))
75
+ in_chs = out_chs
76
+
77
+ out_chs = min(in_chs * 2, max_filters)
78
+ self.convs.append(NormConv1d(in_chs, out_chs, kernel_size=kernel_sizes[0], stride=1,
79
+ padding=(kernel_sizes[0] - 1) // 2, norm=norm))
80
+ self.conv_post = NormConv1d(out_chs, out_channels, kernel_size=kernel_sizes[1], stride=1,
81
+ padding=(kernel_sizes[1] - 1) // 2, norm=norm)
82
+
83
+ def forward(self, x: torch.Tensor):
84
+ fmap = []
85
+ for layer in self.convs:
86
+ x = layer(x)
87
+ x = self.activation(x)
88
+ fmap.append(x)
89
+ x = self.conv_post(x)
90
+ fmap.append(x)
91
+ # x = torch.flatten(x, 1, -1)
92
+ return x, fmap
93
+
94
+
95
+ class MultiScaleDiscriminator(MultiDiscriminator):
96
+ """Multi-Scale (MSD) Discriminator,
97
+
98
+ Args:
99
+ in_channels (int): Number of input channels.
100
+ out_channels (int): Number of output channels.
101
+ downsample_factor (int): Downsampling factor between the different scales.
102
+ scale_norms (Sequence[str]): Normalization for each sub-discriminator.
103
+ **kwargs: Additional args for ScaleDiscriminator.
104
+ """
105
+ def __init__(self, in_channels: int = 1, out_channels: int = 1, downsample_factor: int = 2,
106
+ scale_norms: tp.Sequence[str] = ['weight_norm', 'weight_norm', 'weight_norm'], **kwargs):
107
+ super().__init__()
108
+ self.discriminators = nn.ModuleList([
109
+ ScaleDiscriminator(in_channels, out_channels, norm=norm, **kwargs) for norm in scale_norms
110
+ ])
111
+ self.downsample = nn.AvgPool1d(downsample_factor * 2, downsample_factor, padding=downsample_factor)
112
+
113
+ @property
114
+ def num_discriminators(self):
115
+ return len(self.discriminators)
116
+
117
+ def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
118
+ logits = []
119
+ fmaps = []
120
+ for i, disc in enumerate(self.discriminators):
121
+ if i != 0:
122
+ self.downsample(x)
123
+ logit, fmap = disc(x)
124
+ logits.append(logit)
125
+ fmaps.append(fmap)
126
+ return logits, fmaps
audiocraft/adversarial/discriminators/msstftd.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import torchaudio
10
+ import torch
11
+ from torch import nn
12
+ from einops import rearrange
13
+
14
+ from ...modules import NormConv2d
15
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
16
+
17
+
18
+ def get_2d_padding(kernel_size: tp.Tuple[int, int], dilation: tp.Tuple[int, int] = (1, 1)):
19
+ return (((kernel_size[0] - 1) * dilation[0]) // 2, ((kernel_size[1] - 1) * dilation[1]) // 2)
20
+
21
+
22
+ class DiscriminatorSTFT(nn.Module):
23
+ """STFT sub-discriminator.
24
+
25
+ Args:
26
+ filters (int): Number of filters in convolutions.
27
+ in_channels (int): Number of input channels.
28
+ out_channels (int): Number of output channels.
29
+ n_fft (int): Size of FFT for each scale.
30
+ hop_length (int): Length of hop between STFT windows for each scale.
31
+ kernel_size (tuple of int): Inner Conv2d kernel sizes.
32
+ stride (tuple of int): Inner Conv2d strides.
33
+ dilations (list of int): Inner Conv2d dilation on the time dimension.
34
+ win_length (int): Window size for each scale.
35
+ normalized (bool): Whether to normalize by magnitude after stft.
36
+ norm (str): Normalization method.
37
+ activation (str): Activation function.
38
+ activation_params (dict): Parameters to provide to the activation function.
39
+ growth (int): Growth factor for the filters.
40
+ """
41
+ def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1,
42
+ n_fft: int = 1024, hop_length: int = 256, win_length: int = 1024, max_filters: int = 1024,
43
+ filters_scale: int = 1, kernel_size: tp.Tuple[int, int] = (3, 9), dilations: tp.List = [1, 2, 4],
44
+ stride: tp.Tuple[int, int] = (1, 2), normalized: bool = True, norm: str = 'weight_norm',
45
+ activation: str = 'LeakyReLU', activation_params: dict = {'negative_slope': 0.2}):
46
+ super().__init__()
47
+ assert len(kernel_size) == 2
48
+ assert len(stride) == 2
49
+ self.filters = filters
50
+ self.in_channels = in_channels
51
+ self.out_channels = out_channels
52
+ self.n_fft = n_fft
53
+ self.hop_length = hop_length
54
+ self.win_length = win_length
55
+ self.normalized = normalized
56
+ self.activation = getattr(torch.nn, activation)(**activation_params)
57
+ self.spec_transform = torchaudio.transforms.Spectrogram(
58
+ n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window_fn=torch.hann_window,
59
+ normalized=self.normalized, center=False, pad_mode=None, power=None)
60
+ spec_channels = 2 * self.in_channels
61
+ self.convs = nn.ModuleList()
62
+ self.convs.append(
63
+ NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size))
64
+ )
65
+ in_chs = min(filters_scale * self.filters, max_filters)
66
+ for i, dilation in enumerate(dilations):
67
+ out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters)
68
+ self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,
69
+ dilation=(dilation, 1), padding=get_2d_padding(kernel_size, (dilation, 1)),
70
+ norm=norm))
71
+ in_chs = out_chs
72
+ out_chs = min((filters_scale ** (len(dilations) + 1)) * self.filters, max_filters)
73
+ self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=(kernel_size[0], kernel_size[0]),
74
+ padding=get_2d_padding((kernel_size[0], kernel_size[0])),
75
+ norm=norm))
76
+ self.conv_post = NormConv2d(out_chs, self.out_channels,
77
+ kernel_size=(kernel_size[0], kernel_size[0]),
78
+ padding=get_2d_padding((kernel_size[0], kernel_size[0])),
79
+ norm=norm)
80
+
81
+ def forward(self, x: torch.Tensor):
82
+ fmap = []
83
+ z = self.spec_transform(x) # [B, 2, Freq, Frames, 2]
84
+ z = torch.cat([z.real, z.imag], dim=1)
85
+ z = rearrange(z, 'b c w t -> b c t w')
86
+ for i, layer in enumerate(self.convs):
87
+ z = layer(z)
88
+ z = self.activation(z)
89
+ fmap.append(z)
90
+ z = self.conv_post(z)
91
+ return z, fmap
92
+
93
+
94
+ class MultiScaleSTFTDiscriminator(MultiDiscriminator):
95
+ """Multi-Scale STFT (MS-STFT) discriminator.
96
+
97
+ Args:
98
+ filters (int): Number of filters in convolutions.
99
+ in_channels (int): Number of input channels.
100
+ out_channels (int): Number of output channels.
101
+ sep_channels (bool): Separate channels to distinct samples for stereo support.
102
+ n_ffts (Sequence[int]): Size of FFT for each scale.
103
+ hop_lengths (Sequence[int]): Length of hop between STFT windows for each scale.
104
+ win_lengths (Sequence[int]): Window size for each scale.
105
+ **kwargs: Additional args for STFTDiscriminator.
106
+ """
107
+ def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1, sep_channels: bool = False,
108
+ n_ffts: tp.List[int] = [1024, 2048, 512], hop_lengths: tp.List[int] = [256, 512, 128],
109
+ win_lengths: tp.List[int] = [1024, 2048, 512], **kwargs):
110
+ super().__init__()
111
+ assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
112
+ self.sep_channels = sep_channels
113
+ self.discriminators = nn.ModuleList([
114
+ DiscriminatorSTFT(filters, in_channels=in_channels, out_channels=out_channels,
115
+ n_fft=n_ffts[i], win_length=win_lengths[i], hop_length=hop_lengths[i], **kwargs)
116
+ for i in range(len(n_ffts))
117
+ ])
118
+
119
+ @property
120
+ def num_discriminators(self):
121
+ return len(self.discriminators)
122
+
123
+ def _separate_channels(self, x: torch.Tensor) -> torch.Tensor:
124
+ B, C, T = x.shape
125
+ return x.view(-1, 1, T)
126
+
127
+ def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
128
+ logits = []
129
+ fmaps = []
130
+ for disc in self.discriminators:
131
+ logit, fmap = disc(x)
132
+ logits.append(logit)
133
+ fmaps.append(fmap)
134
+ return logits, fmaps
audiocraft/adversarial/losses.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Utility module to handle adversarial losses without requiring to mess up the main training loop.
9
+ """
10
+
11
+ import typing as tp
12
+
13
+ import flashy
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+
19
+ ADVERSARIAL_LOSSES = ['mse', 'hinge', 'hinge2']
20
+
21
+
22
+ AdvLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor], torch.Tensor]]
23
+ FeatLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor, torch.Tensor], torch.Tensor]]
24
+
25
+
26
class AdversarialLoss(nn.Module):
    """Adversary training wrapper.

    Args:
        adversary (nn.Module): The adversary module will be used to estimate the logits given the fake and real samples.
            We assume here the adversary output is ``Tuple[List[torch.Tensor], List[List[torch.Tensor]]]``
            where the first item is a list of logits and the second item is a list of feature maps.
        optimizer (torch.optim.Optimizer): Optimizer used for training the given module.
        loss (AdvLossType): Loss function for generator training.
        loss_real (AdvLossType): Loss function for adversarial training on logits from real samples.
        loss_fake (AdvLossType): Loss function for adversarial training on logits from fake samples.
        loss_feat (FeatLossType): Feature matching loss function for generator training.
        normalize (bool): Whether to normalize by number of sub-discriminators.

    Example of usage:
        adv_loss = AdversarialLoss(adversaries, optimizer, loss, loss_real, loss_fake)
        for real in loader:
            noise = torch.randn(...)
            fake = model(noise)
            adv_loss.train_adv(fake, real)
            loss, _ = adv_loss(fake, real)
            loss.backward()
    """
    def __init__(self,
                 adversary: nn.Module,
                 optimizer: torch.optim.Optimizer,
                 loss: AdvLossType,
                 loss_real: AdvLossType,
                 loss_fake: AdvLossType,
                 loss_feat: tp.Optional[FeatLossType] = None,
                 normalize: bool = True):
        super().__init__()
        self.adversary: nn.Module = adversary
        # Ensure every distributed worker starts from identical adversary weights.
        flashy.distrib.broadcast_model(self.adversary)
        self.optimizer = optimizer
        self.loss = loss
        self.loss_real = loss_real
        self.loss_fake = loss_fake
        self.loss_feat = loss_feat
        self.normalize = normalize

    def _save_to_state_dict(self, destination, prefix, keep_vars):
        # Add the optimizer state dict inside our own.
        super()._save_to_state_dict(destination, prefix, keep_vars)
        destination[prefix + 'optimizer'] = self.optimizer.state_dict()
        return destination

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # Pop and restore the optimizer state before loading module parameters.
        self.optimizer.load_state_dict(state_dict.pop(prefix + 'optimizer'))
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    def get_adversary_pred(self, x):
        """Run adversary model, validating expected output format."""
        logits, fmaps = self.adversary(x)
        assert isinstance(logits, list) and all(isinstance(t, torch.Tensor) for t in logits), \
            f'Expecting a list of tensors as logits but {type(logits)} found.'
        assert isinstance(fmaps, list), f'Expecting a list of features maps but {type(fmaps)} found.'
        for fmap in fmaps:
            assert isinstance(fmap, list) and all(isinstance(f, torch.Tensor) for f in fmap), \
                f'Expecting a list of tensors as feature maps but {type(fmap)} found.'
        return logits, fmaps

    def train_adv(self, fake: torch.Tensor, real: torch.Tensor) -> torch.Tensor:
        """Train the adversary with the given fake and real example.

        We assume the adversary output is the following format: Tuple[List[torch.Tensor], List[List[torch.Tensor]]].
        The first item being the logits and second item being a list of feature maps for each sub-discriminator.

        This will automatically synchronize gradients (with `flashy.distrib.eager_sync_model`)
        and call the optimizer.
        """
        loss = torch.tensor(0., device=fake.device)
        # Both inputs are detached: only the adversary's parameters are updated here.
        all_logits_fake_is_fake, _ = self.get_adversary_pred(fake.detach())
        all_logits_real_is_fake, _ = self.get_adversary_pred(real.detach())
        n_sub_adversaries = len(all_logits_fake_is_fake)
        for fake_logit, real_logit in zip(all_logits_fake_is_fake, all_logits_real_is_fake):
            loss += self.loss_fake(fake_logit) + self.loss_real(real_logit)

        if self.normalize:
            loss /= n_sub_adversaries

        self.optimizer.zero_grad()
        with flashy.distrib.eager_sync_model(self.adversary):
            loss.backward()
        self.optimizer.step()

        return loss

    def forward(self, fake: torch.Tensor, real: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
        """Return the loss for the generator, i.e. trying to fool the adversary,
        and feature matching loss if provided.
        """
        adv = torch.tensor(0., device=fake.device)
        feat = torch.tensor(0., device=fake.device)
        # NOTE(review): flashy.utils.readonly is expected to prevent adversary
        # parameter updates while still allowing gradients to flow to `fake`.
        with flashy.utils.readonly(self.adversary):
            all_logits_fake_is_fake, all_fmap_fake = self.get_adversary_pred(fake)
            all_logits_real_is_fake, all_fmap_real = self.get_adversary_pred(real)
            n_sub_adversaries = len(all_logits_fake_is_fake)
            for fake_logit in all_logits_fake_is_fake:
                adv += self.loss(fake_logit)
            if self.loss_feat:
                for fmap_fake, fmap_real in zip(all_fmap_fake, all_fmap_real):
                    feat += self.loss_feat(fmap_fake, fmap_real)

        if self.normalize:
            adv /= n_sub_adversaries
            feat /= n_sub_adversaries

        return adv, feat
136
+
137
+
138
def get_adv_criterion(loss_type: str) -> tp.Callable:
    """Return the generator-side adversarial loss for the given loss type."""
    assert loss_type in ADVERSARIAL_LOSSES
    if loss_type == 'mse':
        return mse_loss
    elif loss_type == 'hinge':
        return hinge_loss
    elif loss_type == 'hinge2':
        return hinge2_loss
    raise ValueError('Unsupported loss')


def get_fake_criterion(loss_type: str) -> tp.Callable:
    """Return the discriminator loss applied to logits of fake samples."""
    assert loss_type in ADVERSARIAL_LOSSES
    if loss_type == 'mse':
        return mse_fake_loss
    elif loss_type in ['hinge', 'hinge2']:
        return hinge_fake_loss
    raise ValueError('Unsupported loss')


def get_real_criterion(loss_type: str) -> tp.Callable:
    """Return the discriminator loss applied to logits of real samples."""
    assert loss_type in ADVERSARIAL_LOSSES
    if loss_type == 'mse':
        return mse_real_loss
    elif loss_type in ['hinge', 'hinge2']:
        return hinge_real_loss
    raise ValueError('Unsupported loss')


def mse_real_loss(x: torch.Tensor) -> torch.Tensor:
    """MSE of real logits against the target value 1."""
    return F.mse_loss(x, torch.tensor(1., device=x.device).expand_as(x))


def mse_fake_loss(x: torch.Tensor) -> torch.Tensor:
    """MSE of fake logits against the target value 0."""
    return F.mse_loss(x, torch.tensor(0., device=x.device).expand_as(x))


def hinge_real_loss(x: torch.Tensor) -> torch.Tensor:
    """Hinge loss pushing real logits above 1."""
    return -torch.mean(torch.min(x - 1, torch.tensor(0., device=x.device).expand_as(x)))


def hinge_fake_loss(x: torch.Tensor) -> torch.Tensor:
    """Hinge loss pushing fake logits below -1."""
    return -torch.mean(torch.min(-x - 1, torch.tensor(0., device=x.device).expand_as(x)))


def mse_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator MSE loss: drive fake logits towards 1. Returns 0 for empty input."""
    if x.numel() == 0:
        return torch.tensor([0.0], device=x.device)
    return F.mse_loss(x, torch.tensor(1., device=x.device).expand_as(x))


def hinge_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator hinge loss: maximize fake logits. Returns 0 for empty input."""
    if x.numel() == 0:
        return torch.tensor([0.0], device=x.device)
    return -x.mean()


def hinge2_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator hinge2 loss: drive fake logits above 1. Returns 0 for empty input."""
    if x.numel() == 0:
        # Bug fix: allocate the zero on the same device as the input, consistent
        # with mse_loss/hinge_loss (the original always allocated it on CPU,
        # which breaks downstream device expectations for CUDA inputs).
        return torch.tensor([0.0], device=x.device)
    return -torch.mean(torch.min(x - 1, torch.tensor(0., device=x.device).expand_as(x)))
199
+
200
+
201
class FeatureMatchingLoss(nn.Module):
    """Feature matching loss for adversarial training.

    Computes an elementwise distance (L1 by default) between each pair of
    fake/real feature maps and sums (optionally averages) over the maps.

    Args:
        loss (nn.Module, optional): Loss to use for feature matching
            (defaults to a fresh ``torch.nn.L1Loss`` per instance).
        normalize (bool): Whether to normalize the loss by the number of feature maps.
    """
    def __init__(self, loss: tp.Optional[nn.Module] = None, normalize: bool = True):
        super().__init__()
        # Avoid a mutable default argument: a default L1Loss instance in the
        # signature would be shared across all FeatureMatchingLoss instances.
        self.loss = loss if loss is not None else torch.nn.L1Loss()
        self.normalize = normalize

    def forward(self, fmap_fake: tp.List[torch.Tensor], fmap_real: tp.List[torch.Tensor]) -> torch.Tensor:
        """Return the (optionally averaged) sum of per-map distances.

        Both lists must be non-empty, of equal length, with pairwise-matching shapes.
        """
        assert len(fmap_fake) == len(fmap_real) and len(fmap_fake) > 0
        feat_loss = torch.tensor(0., device=fmap_fake[0].device)
        n_fmaps = 0
        for feat_fake, feat_real in zip(fmap_fake, fmap_real):
            assert feat_fake.shape == feat_real.shape
            n_fmaps += 1
            feat_loss += self.loss(feat_fake, feat_real)
        # (the original also accumulated a `feat_scale` running mean of
        # |feat_real| that was never used; it has been removed)

        if self.normalize:
            feat_loss /= n_fmaps

        return feat_loss
audiocraft/data/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Audio loading and writing support. Datasets for raw audio
7
+ or also including some metadata."""
8
+
9
+ # flake8: noqa
10
+ from . import audio, audio_dataset, info_audio_dataset, music_dataset, sound_dataset
audiocraft/data/audio.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Audio IO methods are defined in this module (info, read, write),
9
+ We rely on av library for faster read when possible, otherwise on torchaudio.
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ import logging
15
+ import typing as tp
16
+
17
+ import numpy as np
18
+ import soundfile
19
+ import torch
20
+ from torch.nn import functional as F
21
+ import torchaudio as ta
22
+
23
+ import av
24
+
25
+ from .audio_utils import f32_pcm, i16_pcm, normalize_audio
26
+
27
+
28
+ _av_initialized = False
29
+
30
+
31
+ def _init_av():
32
+ global _av_initialized
33
+ if _av_initialized:
34
+ return
35
+ logger = logging.getLogger('libav.mp3')
36
+ logger.setLevel(logging.ERROR)
37
+ _av_initialized = True
38
+
39
+
40
@dataclass(frozen=True)
class AudioFileInfo:
    """Immutable metadata describing an audio file."""
    sample_rate: int  # samples per second
    duration: float   # length in seconds
    channels: int     # number of audio channels
45
+
46
+
47
def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Fetch sample rate, duration and channel count through PyAV (ffmpeg)."""
    _init_av()
    with av.open(str(filepath)) as af:
        stream = af.streams.audio[0]
        # NOTE(review): assumes stream.duration is not None — some containers
        # may not expose a duration; confirm for the formats in use.
        duration = float(stream.duration * stream.time_base)
        return AudioFileInfo(stream.codec_context.sample_rate, duration, stream.channels)
55
+
56
+
57
def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Fetch sample rate, duration and channel count through soundfile."""
    info = soundfile.info(filepath)
    return AudioFileInfo(info.samplerate, info.duration, info.channels)
60
+
61
+
62
def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Return metadata (sample rate, duration, channels) for an audio file.

    torchaudio no longer returns useful duration information for some formats
    like mp3, so we dispatch between PyAV and soundfile based on the suffix.
    """
    filepath = Path(filepath)
    # ffmpeg has some weird issue with flac, so use soundfile for it.
    # TODO: Validate .ogg can be safely read with _av_info.
    if filepath.suffix in ('.flac', '.ogg'):
        return _soundfile_info(filepath)
    return _av_info(filepath)
70
+
71
+
72
def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
    """FFMPEG-based audio file reading using PyAV bindings.
    Soundfile cannot read mp3 and av_read is more efficient than torchaudio.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
    Returns:
        tuple of torch.Tensor, int: Tuple containing audio data and sample rate
    """
    _init_av()
    with av.open(str(filepath)) as af:
        stream = af.streams.audio[0]
        sr = stream.codec_context.sample_rate
        num_frames = int(sr * duration) if duration >= 0 else -1
        frame_offset = int(sr * seek_time)
        # Seek slightly before the requested time: the mp3 decoder otherwise
        # produces an edge artifact on the very first decoded frame.
        af.seek(int(max(0, (seek_time - 0.1)) / stream.time_base), stream=stream)
        chunks = []
        total = 0
        for frame in af.decode(streams=stream.index):
            current_offset = int(frame.rate * frame.pts * frame.time_base)
            # Drop samples decoded before the requested start position.
            strip = max(0, frame_offset - current_offset)
            buf = torch.from_numpy(frame.to_ndarray())
            if buf.shape[0] != stream.channels:
                # Interleaved layout: reshape to (channels, time).
                buf = buf.view(-1, stream.channels).t()
            buf = buf[:, strip:]
            chunks.append(buf)
            total += buf.shape[1]
            if num_frames > 0 and total >= num_frames:
                break
        # If this assert fails, it is likely because we seeked past the end of
        # file, in which case ffmpeg returns a single frame with only zeros and
        # a weird timestamp. This will need proper debugging, in due time.
        assert chunks
        wav = torch.cat(chunks, dim=1)
        assert wav.shape[0] == stream.channels
        if num_frames > 0:
            wav = wav[:, :num_frames]
        return f32_pcm(wav), sr
114
+
115
+
116
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
    """Read audio by picking the most appropriate backend tool based on the audio format.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
        pad (bool): Pad output audio if not reaching expected duration.
    Returns:
        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
    """
    fp = Path(filepath)
    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
        # There is some bug with ffmpeg and reading flac.
        info = _soundfile_info(filepath)
        frames = -1 if duration <= 0 else int(duration * info.sample_rate)
        frame_offset = int(seek_time * info.sample_rate)
        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
        wav = torch.from_numpy(wav).t().contiguous()
        if wav.dim() == 1:
            # Mono file: soundfile returns a 1D array, add the channel axis.
            wav = wav.unsqueeze(0)
    elif (fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
          and duration <= 0 and seek_time == 0):
        # Torchaudio is faster if we load an entire file at once.
        wav, sr = ta.load(fp)
    else:
        wav, sr = _av_read(filepath, seek_time, duration)
    if pad and duration > 0:
        expected_frames = int(duration * sr)
        # A negative pad amount truncates, so the output has exactly
        # expected_frames samples either way.
        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
    return wav, sr
151
+
152
+
153
def audio_write(stem_name: tp.Union[str, Path],
                wav: torch.Tensor, sample_rate: int,
                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                loudness_compressor: bool = False,
                log_clipping: bool = True, make_parent_dir: bool = True,
                add_suffix: bool = True) -> Path:
    """Convenience function for saving audio to disk. Returns the filename the audio was written to.

    Args:
        stem_name (str or Path): Filename without extension which will be added automatically.
        wav (torch.Tensor): Float audio data, shape [C, T] or [T].
        sample_rate (int): Sample rate of the audio data.
        format (str): Either "wav" or "mp3".
        mp3_rate (int): kbps when using mp3s.
        normalize (bool): if `True` (default), normalizes according to the prescribed
            strategy (see after). If `False`, the strategy is only used in case clipping
            would happen.
        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
            with extra headroom to avoid clipping. 'clip' just clips.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
            than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms').
        make_parent_dir (bool): Make parent directory if it doesn't exist.
        add_suffix (bool): If True (default), append the format suffix to `stem_name`.
    Returns:
        Path: Path of the saved audio.
    """
    assert wav.dtype.is_floating_point, "wav is not floating point"
    if wav.dim() == 1:
        wav = wav[None]
    elif wav.dim() > 2:
        raise ValueError("Input wav should be at most 2 dimension.")
    assert wav.isfinite().all()
    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
                          log_clipping=log_clipping, sample_rate=sample_rate,
                          stem_name=str(stem_name))
    if format == 'mp3':
        suffix = '.mp3'
        kwargs: dict = {"compression": mp3_rate}
    elif format == 'wav':
        wav = i16_pcm(wav)
        suffix = '.wav'
        kwargs = {"encoding": "PCM_S", "bits_per_sample": 16}
    else:
        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
    path = Path(str(stem_name) + (suffix if add_suffix else ''))
    if make_parent_dir:
        path.parent.mkdir(exist_ok=True, parents=True)
    try:
        ta.save(path, wav, sample_rate, **kwargs)
    except Exception:
        # We do not want to leave half written files around.
        if path.exists():
            path.unlink()
        raise
    return path
audiocraft/data/audio_dataset.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """AudioDataset support. In order to handle a larger number of files
7
+ without having to scan again the folders, we precompute some metadata
8
+ (filename, sample rate, duration), and use that to efficiently sample audio segments.
9
+ """
10
+ import argparse
11
+ import copy
12
+ from concurrent.futures import ThreadPoolExecutor, Future
13
+ from dataclasses import dataclass, fields
14
+ from contextlib import ExitStack
15
+ from functools import lru_cache
16
+ import gzip
17
+ import json
18
+ import logging
19
+ import os
20
+ from pathlib import Path
21
+ import random
22
+ import sys
23
+ import typing as tp
24
+
25
+ import torch
26
+ import torch.nn.functional as F
27
+
28
+ from .audio import audio_read, audio_info
29
+ from .audio_utils import convert_audio
30
+ from .zip import PathInZip
31
+
32
+ try:
33
+ import dora
34
+ except ImportError:
35
+ dora = None # type: ignore
36
+
37
+
38
@dataclass(order=True)
class BaseInfo:
    """Base class for dataset metadata, with dict (de)serialization helpers."""

    @classmethod
    def _dict2fields(cls, dictionary: dict):
        # Keep only the keys that correspond to declared dataclass fields.
        return {fld.name: dictionary[fld.name]
                for fld in fields(cls) if fld.name in dictionary}

    @classmethod
    def from_dict(cls, dictionary: dict):
        """Build an instance from a dict, silently ignoring unknown keys."""
        return cls(**cls._dict2fields(dictionary))

    def to_dict(self):
        """Serialize every declared field into a plain dict."""
        return {fld.name: getattr(self, fld.name) for fld in fields(self)}
58
+
59
+
60
@dataclass(order=True)
class AudioMeta(BaseInfo):
    """Metadata for a single audio file."""
    path: str
    duration: float
    sample_rate: int
    amplitude: tp.Optional[float] = None
    weight: tp.Optional[float] = None
    # info_path is used to load additional information about the audio file
    # that is stored in zip files.
    info_path: tp.Optional[PathInZip] = None

    @classmethod
    def from_dict(cls, dictionary: dict):
        base = cls._dict2fields(dictionary)
        # Rehydrate the serialized info_path string into a PathInZip.
        if base.get('info_path') is not None:
            base['info_path'] = PathInZip(base['info_path'])
        return cls(**base)

    def to_dict(self):
        d = super().to_dict()
        # PathInZip is not JSON-serializable; store its string form instead.
        if d['info_path'] is not None:
            d['info_path'] = str(d['info_path'])
        return d
82
+
83
+
84
@dataclass(order=True)
class SegmentInfo(BaseInfo):
    """Metadata describing one sampled segment of an audio file."""
    meta: AudioMeta    # metadata of the source audio file
    seek_time: float   # start position of the segment within the file, in seconds
    # The following values are given once the audio is processed, e.g.
    # at the target sample rate and target number of channels.
    n_frames: int      # actual number of frames without padding
    total_frames: int  # total number of frames, padding included
    sample_rate: int   # actual sample rate
    channels: int      # number of audio channels.
94
+
95
+
96
+ DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
97
+
98
+ logger = logging.getLogger(__name__)
99
+
100
+
101
def _get_audio_meta(file_path: str, minimal: bool = True) -> AudioMeta:
    """AudioMeta from a path to an audio file.

    Args:
        file_path (str): Resolved path of valid audio file.
        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
    Returns:
        AudioMeta: Audio file path and its metadata.
    """
    info = audio_info(file_path)
    amplitude: tp.Optional[float] = None
    if not minimal:
        # Decoding the whole file just for the peak amplitude is expensive,
        # hence it is only done when minimal=False.
        wav, _ = audio_read(file_path)  # sample rate already known from `info`
        amplitude = wav.abs().max().item()
    return AudioMeta(file_path, info.duration, info.sample_rate, amplitude)
116
+
117
+
118
def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
    """If Dora is available as a dependency, try to resolve potential relative paths
    in list of AudioMeta. This method is expected to be used when loading meta from file.

    Args:
        m (AudioMeta): Audio meta to resolve.
        fast (bool): If True, uses a really fast check for determining if a file
            is already absolute or not. Only valid on Linux/Mac.
    Returns:
        AudioMeta: Audio meta with resolved path.
    """
    def is_abs(m):
        if fast:
            # Fast path: just check for a leading '/' (POSIX-only).
            return str(m)[0] == '/'
        # Bug fix: the original computed os.path.isabs(...) here but never
        # returned it, so is_abs() was always falsy when fast=False.
        return os.path.isabs(str(m))

    if not dora:
        return m

    if not is_abs(m.path):
        m.path = dora.git_save.to_absolute_path(m.path)
    if m.info_path is not None and not is_abs(m.info_path.zip_path):
        # NOTE(review): resolving zip_path from m.path looks suspicious —
        # presumably it should be to_absolute_path(m.info_path.zip_path);
        # kept as-is pending confirmation.
        m.info_path.zip_path = dora.git_save.to_absolute_path(m.path)
    return m
143
+
144
+
145
def find_audio_files(path: tp.Union[Path, str],
                     exts: tp.List[str] = DEFAULT_EXTS,
                     resolve: bool = True,
                     minimal: bool = True,
                     progress: bool = False,
                     workers: int = 0) -> tp.List[AudioMeta]:
    """Build a list of AudioMeta from a given path,
    collecting relevant audio files and fetching meta info.

    Args:
        path (str or Path): Path to folder containing audio files.
        exts (list of str): List of file extensions to consider for audio files.
        resolve (bool): Whether to resolve relative paths in the resulting metadata.
        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
        progress (bool): Whether to log progress on audio files collection.
        workers (int): number of parallel workers, if 0, use only the current thread.
    Returns:
        list of AudioMeta: List of audio file path and its metadata.
    """
    audio_files = []
    futures: tp.List[Future] = []
    pool: tp.Optional[ThreadPoolExecutor] = None
    with ExitStack() as stack:
        if workers > 0:
            pool = ThreadPoolExecutor(workers)
            stack.enter_context(pool)

        if progress:
            print("Finding audio files...")
        for root, folders, files in os.walk(path, followlinks=True):
            for file in files:
                full_path = Path(root) / file
                if full_path.suffix.lower() not in exts:
                    continue
                audio_files.append(full_path)
                if pool is not None:
                    # Kick off metadata extraction in the background.
                    futures.append(pool.submit(_get_audio_meta, str(full_path), minimal))
                if progress:
                    print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)

        if progress:
            print("Getting audio metadata...")
        meta: tp.List[AudioMeta] = []
        for idx, file_path in enumerate(audio_files):
            try:
                m = _get_audio_meta(str(file_path), minimal) if pool is None else futures[idx].result()
                if resolve:
                    m = _resolve_audio_meta(m)
            except Exception as err:
                # Unreadable or corrupt files are skipped with a warning.
                print("Error with", str(file_path), err, file=sys.stderr)
                continue
            meta.append(m)
            if progress:
                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
    meta.sort()
    return meta
202
+
203
+
204
def load_audio_meta(path: tp.Union[str, Path],
                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
    """Load list of AudioMeta from an optionally compressed json file.

    Args:
        path (str or Path): Path to the JSON-lines file (gzipped when ending in '.gz').
        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
        fast (bool): activates some tricks to make things faster.
    Returns:
        list of AudioMeta: List of audio file path and its total duration.
    """
    # One JSON object per line; gzip is detected from the file extension.
    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
    meta = []
    with open_fn(path, 'rb') as fp:  # type: ignore
        for line in fp.readlines():
            m = AudioMeta.from_dict(json.loads(line))
            if resolve:
                m = _resolve_audio_meta(m, fast=fast)
            meta.append(m)
    return meta
226
+
227
+
228
def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
    """Save the audio metadata to the given path as JSON-lines.

    Args:
        path (str or Path): Path to JSON file (gzipped when it ends in '.gz').
        meta (list of AudioMeta): List of audio meta to save.
    """
    Path(path).parent.mkdir(exist_ok=True, parents=True)
    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
    with open_fn(path, 'wb') as fp:  # type: ignore
        for m in meta:
            # One JSON object per line.
            fp.write((json.dumps(m.to_dict()) + '\n').encode('utf-8'))
242
+
243
+
244
+ class AudioDataset:
245
+ """Base audio dataset.
246
+
247
+ The dataset takes a list of AudioMeta and create a dataset composed of segments of audio
248
+ and potentially additional information, by creating random segments from the list of audio
249
+ files referenced in the metadata and applying minimal data pre-processing such as resampling,
250
+ mixing of channels, padding, etc.
251
+
252
+ If no segment_duration value is provided, the AudioDataset will return the full wav for each
253
+ audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
254
+ duration, applying padding if required.
255
+
256
+ By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
257
+ allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
258
+ original audio meta.
259
+
260
+ Note that you can call `start_epoch(epoch)` in order to get
261
+ a deterministic "randomization" for `shuffle=True`.
262
+ For a given epoch and dataset index, this will always return the same extract.
263
+ You can get back some diversity by setting the `shuffle_seed` param.
264
+
265
+ Args:
266
+ meta (list of AudioMeta): List of audio files metadata.
267
+ segment_duration (float, optional): Optional segment duration of audio to load.
268
+ If not specified, the dataset will load the full audio segment from the file.
269
+ shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
270
+ sample_rate (int): Target sample rate of the loaded audio samples.
271
+ channels (int): Target number of channels of the loaded audio samples.
272
+ sample_on_duration (bool): Set to `True` to sample segments with probability
273
+ dependent on audio file duration. This is only used if `segment_duration` is provided.
274
+ sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
275
+ `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
276
+ of the file duration and file weight. This is only used if `segment_duration` is provided.
277
+ min_segment_ratio (float): Minimum segment ratio to use when the audio file
278
+ is shorter than the desired segment.
279
+ max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
280
+ return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
281
+ min_audio_duration (float, optional): Minimum audio file duration, in seconds, if provided
282
+ audio shorter than this will be filtered out.
283
+ max_audio_duration (float, optional): Maximal audio file duration in seconds, if provided
284
+ audio longer than this will be filtered out.
285
+ shuffle_seed (int): can be used to further randomize
286
+ load_wav (bool): if False, skip loading the wav but returns a tensor of 0
287
+ with the expected segment_duration (which must be provided if load_wav is False).
288
+ permutation_on_files (bool): only if `sample_on_weight` and `sample_on_duration`
289
+ are False. Will ensure a permutation on files when going through the dataset.
290
+ In that case the epoch number must be provided in order for the model
291
+ to continue the permutation across epochs. In that case, it is assumed
292
+ that `num_samples = total_batch_size * num_updates_per_epoch`, with
293
+ `total_batch_size` the overall batch size accounting for all gpus.
294
+ """
295
    def __init__(self,
                 meta: tp.List[AudioMeta],
                 segment_duration: tp.Optional[float] = None,
                 shuffle: bool = True,
                 num_samples: int = 10_000,
                 sample_rate: int = 48_000,
                 channels: int = 2,
                 pad: bool = True,
                 sample_on_duration: bool = True,
                 sample_on_weight: bool = True,
                 min_segment_ratio: float = 0.5,
                 max_read_retry: int = 10,
                 return_info: bool = False,
                 min_audio_duration: tp.Optional[float] = None,
                 max_audio_duration: tp.Optional[float] = None,
                 shuffle_seed: int = 0,
                 load_wav: bool = True,
                 permutation_on_files: bool = False,
                 ):
        # See the class docstring for the meaning of each argument.
        assert len(meta) > 0, "No audio meta provided to AudioDataset. Please check loading of audio meta."
        assert segment_duration is None or segment_duration > 0
        assert segment_duration is None or min_segment_ratio >= 0
        self.segment_duration = segment_duration
        self.min_segment_ratio = min_segment_ratio
        self.max_audio_duration = max_audio_duration
        self.min_audio_duration = min_audio_duration
        if self.min_audio_duration is not None and self.max_audio_duration is not None:
            assert self.min_audio_duration <= self.max_audio_duration
        # Drop files outside the [min_audio_duration, max_audio_duration] bounds.
        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
        assert len(self.meta)  # Fail fast if all data has been filtered.
        self.total_duration = sum(d.duration for d in self.meta)

        if segment_duration is None:
            # Whole-file mode: exactly one sample per remaining file.
            num_samples = len(self.meta)
        self.num_samples = num_samples
        self.shuffle = shuffle
        self.sample_rate = sample_rate
        self.channels = channels
        self.pad = pad
        self.sample_on_weight = sample_on_weight
        self.sample_on_duration = sample_on_duration
        # Pre-computed, normalized per-file sampling weights (1D torch tensor).
        self.sampling_probabilities = self._get_sampling_probabilities()
        self.max_read_retry = max_read_retry
        self.return_info = return_info
        self.shuffle_seed = shuffle_seed
        self.current_epoch: tp.Optional[int] = None
        self.load_wav = load_wav
        if not load_wav:
            # Without loading audio we must know how many zero frames to return.
            assert segment_duration is not None
        self.permutation_on_files = permutation_on_files
        if permutation_on_files:
            # Permutation mode requires uniform sampling and shuffling (see class docstring).
            assert not self.sample_on_duration
            assert not self.sample_on_weight
            assert self.shuffle
349
+
350
+ def start_epoch(self, epoch: int):
351
+ self.current_epoch = epoch
352
+
353
+ def __len__(self):
354
+ return self.num_samples
355
+
356
+ def _get_sampling_probabilities(self, normalized: bool = True):
357
+ """Return the sampling probabilities for each file inside `self.meta`."""
358
+ scores: tp.List[float] = []
359
+ for file_meta in self.meta:
360
+ score = 1.
361
+ if self.sample_on_weight and file_meta.weight is not None:
362
+ score *= file_meta.weight
363
+ if self.sample_on_duration:
364
+ score *= file_meta.duration
365
+ scores.append(score)
366
+ probabilities = torch.tensor(scores)
367
+ if normalized:
368
+ probabilities /= probabilities.sum()
369
+ return probabilities
370
+
371
+ @staticmethod
372
+ @lru_cache(16)
373
+ def _get_file_permutation(num_files: int, permutation_index: int, base_seed: int):
374
+ # Used to keep the most recent files permutation in memory implicitely.
375
+ # will work unless someone is using a lot of Datasets in parallel.
376
+ rng = torch.Generator()
377
+ rng.manual_seed(base_seed + permutation_index)
378
+ return torch.randperm(num_files, generator=rng)
379
+
380
    def sample_file(self, index: int, rng: torch.Generator) -> AudioMeta:
        """Sample a given file from `self.meta`. Can be overridden in subclasses.
        This is only called if `segment_duration` is not None.

        You must use the provided random number generator `rng` for reproducibility.
        You can further make use of the index accessed.
        """
        if self.permutation_on_files:
            # Deterministic pass over all files: from the flat sample index derive
            # which global permutation we are in and the position inside it.
            assert self.current_epoch is not None
            total_index = self.current_epoch * len(self) + index
            permutation_index = total_index // len(self.meta)
            relative_index = total_index % len(self.meta)
            permutation = AudioDataset._get_file_permutation(
                len(self.meta), permutation_index, self.shuffle_seed)
            file_index = permutation[relative_index]
            return self.meta[file_index]

        if not self.sample_on_weight and not self.sample_on_duration:
            # Uniform sampling over files.
            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
        else:
            # Weighted sampling according to the pre-computed probabilities.
            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())

        return self.meta[file_index]
403
+
404
+ def _audio_read(self, path: str, seek_time: float = 0, duration: float = -1):
405
+ # Override this method in subclass if needed.
406
+ if self.load_wav:
407
+ return audio_read(path, seek_time, duration, pad=False)
408
+ else:
409
+ assert self.segment_duration is not None
410
+ n_frames = int(self.sample_rate * self.segment_duration)
411
+ return torch.zeros(self.channels, n_frames), self.sample_rate
412
+
413
    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
        """Return the audio (and optionally its SegmentInfo when `return_info` is set)."""
        if self.segment_duration is None:
            # Whole-file mode: `index` directly addresses a file.
            file_meta = self.meta[index]
            out, sr = audio_read(file_meta.path)
            out = convert_audio(out, sr, self.sample_rate, self.channels)
            n_frames = out.shape[-1]
            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
                                       sample_rate=self.sample_rate, channels=out.shape[0])
        else:
            rng = torch.Generator()
            if self.shuffle:
                # We use index, plus extra randomness, either totally random if we don't know the epoch.
                # otherwise we make use of the epoch number and optional shuffle_seed.
                if self.current_epoch is None:
                    rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
                else:
                    rng.manual_seed(index + self.num_samples * (self.current_epoch + self.shuffle_seed))
            else:
                # We only use index
                rng.manual_seed(index)

            for retry in range(self.max_read_retry):
                file_meta = self.sample_file(index, rng)
                # We add some variance in the file position even if audio file is smaller than segment
                # without ending up with empty segments
                max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
                seek_time = torch.rand(1, generator=rng).item() * max_seek
                try:
                    out, sr = audio_read(file_meta.path, seek_time, self.segment_duration, pad=False)
                    out = convert_audio(out, sr, self.sample_rate, self.channels)
                    n_frames = out.shape[-1]
                    target_frames = int(self.segment_duration * self.sample_rate)
                    if self.pad:
                        # Right-pad with zeros up to the exact segment length.
                        out = F.pad(out, (0, target_frames - n_frames))
                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
                                               sample_rate=self.sample_rate, channels=out.shape[0])
                except Exception as exc:
                    # Unreadable file: log and retry with a newly sampled file,
                    # re-raising only once all retries are exhausted.
                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
                    if retry == self.max_read_retry - 1:
                        raise
                else:
                    break

        if self.return_info:
            # Returns the wav and additional information on the wave segment
            return out, segment_info
        else:
            return out
461
+
462
    def collater(self, samples):
        """The collater function has to be provided to the dataloader
        if AudioDataset has return_info=True in order to properly collate
        the samples of a batch.
        """
        if self.segment_duration is None and len(samples) > 1:
            assert self.pad, "Must allow padding when batching examples of different durations."

        # In this case the audio reaching the collater is of variable length as segment_duration=None.
        to_pad = self.segment_duration is None and self.pad
        if to_pad:
            # NOTE(review): when return_info=False the samples are plain tensors, and
            # the `wav, _` unpacking below would split on the first (channel) dim —
            # confirm this path is only reached with return_info=True.
            max_len = max([wav.shape[-1] for wav, _ in samples])

            def _pad_wav(wav):
                # Right-pad a single wav up to the longest one in the batch.
                return F.pad(wav, (0, max_len - wav.shape[-1]))

        if self.return_info:
            if len(samples) > 0:
                # Sanity-check the expected (wav, SegmentInfo) structure.
                assert len(samples[0]) == 2
                assert isinstance(samples[0][0], torch.Tensor)
                assert isinstance(samples[0][1], SegmentInfo)

            wavs = [wav for wav, _ in samples]
            # Deep-copy the infos so mutating total_frames below does not leak
            # into objects the dataset may still hold.
            segment_infos = [copy.deepcopy(info) for _, info in samples]

            if to_pad:
                # Each wav could be of a different duration as they are not segmented.
                for i in range(len(samples)):
                    # Determines the total length of the signal with padding, so we update here as we pad.
                    segment_infos[i].total_frames = max_len
                    wavs[i] = _pad_wav(wavs[i])

            wav = torch.stack(wavs)
            return wav, segment_infos
        else:
            assert isinstance(samples[0], torch.Tensor)
            if to_pad:
                samples = [_pad_wav(s) for s in samples]
            return torch.stack(samples)
501
+
502
+ def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
503
+ """Filters out audio files with audio durations that will not allow to sample examples from them."""
504
+ orig_len = len(meta)
505
+
506
+ # Filter data that is too short.
507
+ if self.min_audio_duration is not None:
508
+ meta = [m for m in meta if m.duration >= self.min_audio_duration]
509
+
510
+ # Filter data that is too long.
511
+ if self.max_audio_duration is not None:
512
+ meta = [m for m in meta if m.duration <= self.max_audio_duration]
513
+
514
+ filtered_len = len(meta)
515
+ removed_percentage = 100*(1-float(filtered_len)/orig_len)
516
+ msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
517
+ if removed_percentage < 10:
518
+ logging.debug(msg)
519
+ else:
520
+ logging.warning(msg)
521
+ return meta
522
+
523
+ @classmethod
524
+ def from_meta(cls, root: tp.Union[str, Path], **kwargs):
525
+ """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.
526
+
527
+ Args:
528
+ root (str or Path): Path to root folder containing audio files.
529
+ kwargs: Additional keyword arguments for the AudioDataset.
530
+ """
531
+ root = Path(root)
532
+ if root.is_dir():
533
+ if (root / 'data.jsonl').exists():
534
+ root = root / 'data.jsonl'
535
+ elif (root / 'data.jsonl.gz').exists():
536
+ root = root / 'data.jsonl.gz'
537
+ else:
538
+ raise ValueError("Don't know where to read metadata from in the dir. "
539
+ "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
540
+ meta = load_audio_meta(root)
541
+ return cls(meta, **kwargs)
542
+
543
+ @classmethod
544
+ def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
545
+ exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
546
+ """Instantiate AudioDataset from a path containing (possibly nested) audio files.
547
+
548
+ Args:
549
+ root (str or Path): Path to root folder containing audio files.
550
+ minimal_meta (bool): Whether to only load minimal metadata or not.
551
+ exts (list of str): Extensions for audio files.
552
+ kwargs: Additional keyword arguments for the AudioDataset.
553
+ """
554
+ root = Path(root)
555
+ if root.is_file():
556
+ meta = load_audio_meta(root, resolve=True)
557
+ else:
558
+ meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
559
+ return cls(meta, **kwargs)
560
+
561
+
562
def main():
    """CLI entry point: scan a folder for audio files and save their metadata as jsonl.

    Usage: audio_dataset ROOT OUTPUT_META_FILE [--complete] [--resolve] [--workers N]
    """
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    parser = argparse.ArgumentParser(
        prog='audio_dataset',
        description='Generate .jsonl files by scanning a folder.')
    parser.add_argument('root', help='Root folder with all the audio files')
    parser.add_argument('output_meta_file',
                        help='Output file to store the metadata.')
    # --complete flips `minimal` to False so the expensive metadata is computed too.
    parser.add_argument('--complete',
                        action='store_false', dest='minimal', default=True,
                        help='Retrieve all metadata, even the ones that are expensive '
                             'to compute (e.g. normalization).')
    parser.add_argument('--resolve',
                        action='store_true', default=False,
                        help='Resolve the paths to be absolute and with no symlinks.')
    parser.add_argument('--workers',
                        default=10, type=int,
                        help='Number of workers.')
    args = parser.parse_args()
    meta = find_audio_files(args.root, DEFAULT_EXTS, progress=True,
                            resolve=args.resolve, minimal=args.minimal, workers=args.workers)
    save_audio_meta(args.output_meta_file, meta)


if __name__ == '__main__':
    main()
audiocraft/data/audio_utils.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Various utilities for audio convertion (pcm format, sample rate and channels),
7
+ and volume normalization."""
8
+ import sys
9
+ import typing as tp
10
+
11
+ import julius
12
+ import torch
13
+ import torchaudio
14
+
15
+
16
def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
    """Convert audio to the given number of channels.

    Args:
        wav (torch.Tensor): Audio wave of shape [B, C, T].
        channels (int): Expected number of channels as output.
    Returns:
        torch.Tensor: Downmixed or unchanged audio wave [B, C, T].
    """
    *shape, src_channels, length = wav.shape
    if src_channels == channels:
        # Already the requested layout: nothing to do.
        return wav
    if channels == 1:
        # Downmix any multichannel input to mono by averaging channels.
        return wav.mean(dim=-2, keepdim=True)
    if src_channels == 1:
        # Replicate the single channel to reach the requested count.
        return wav.expand(*shape, channels, length)
    if src_channels >= channels:
        # More channels than requested: keep the first ones.
        return wav[..., :channels, :]
    # E.g. 2 -> 3 channels: no obviously correct upmix exists.
    raise ValueError('The audio file has less channels than requested but is not mono.')
47
+
48
+
49
def convert_audio(wav: torch.Tensor, from_rate: float,
                  to_rate: float, to_channels: int) -> torch.Tensor:
    """Convert audio to new sample rate and number of audio channels."""
    # Resample first (rates are truncated to ints for julius), then remap channels.
    resampled = julius.resample_frac(wav, int(from_rate), int(to_rate))
    return convert_audio_channels(resampled, to_channels)
55
+
56
+
57
def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
    """Normalize an input signal to a user loudness in dB LKFS.
    Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.

    Args:
        wav (torch.Tensor): Input multichannel audio data.
        sample_rate (int): Sample rate.
        loudness_headroom_db (float): Target loudness of the output in dB LUFS.
        loudness_compressor (bool): Uses tanh for soft clipping.
        energy_floor (float): anything below that RMS level will not be rescaled.
    Returns:
        torch.Tensor: Loudness normalized output data.
    """
    # Skip near-silent signals: the required gain would blow up.
    energy = wav.pow(2).mean().sqrt().item()
    if energy < energy_floor:
        return wav
    transform = torchaudio.transforms.Loudness(sample_rate)
    input_loudness_db = transform(wav).item()
    # calculate the gain needed to scale to the desired loudness level
    delta_loudness = -loudness_headroom_db - input_loudness_db
    gain = 10.0 ** (delta_loudness / 20.0)
    output = gain * wav
    if loudness_compressor:
        # Soft-clip to tame any overshoot introduced by the gain.
        output = torch.tanh(output)
    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
    return output
84
+
85
+
86
def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
    """Utility function to clip the audio with logging if specified."""
    max_scale = wav.abs().max()
    if log_clipping and max_scale > 1:
        # Report how much of the signal exceeds [-1, 1].
        clamp_prob = (wav.abs() > 1).float().mean().item()
        print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
              clamp_prob, "maximum scale: ", max_scale.item(), file=sys.stderr)
    #wav.clamp_(-1, 1)
    # NOTE(review): clamping a clone makes this function a no-op for the caller —
    # the in-place clamp (commented out above) was replaced and the clamped clone
    # is discarded, so callers such as normalize_audio no longer receive clipped
    # audio. Confirm this behavior change is intentional.
    wav = wav.clone().clamp_(-1, 1)
95
+
96
+
97
def normalize_audio(wav: torch.Tensor, normalize: bool = True,
                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                    loudness_compressor: bool = False, log_clipping: bool = False,
                    sample_rate: tp.Optional[int] = None,
                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
    """Normalize the audio according to the prescribed strategy (see after).

    Args:
        wav (torch.Tensor): Audio data.
        normalize (bool): if `True` (default), normalizes according to the prescribed
            strategy (see after). If `False`, the strategy is only used in case clipping
            would happen.
        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
            with extra headroom to avoid clipping. 'clip' just clips.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
            than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        loudness_compressor (bool): If True, uses tanh based soft clipping.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms').
        sample_rate (int): Sample rate for the audio data (required for loudness).
        stem_name (str, optional): Stem name for clipping logging.
    Returns:
        torch.Tensor: Normalized audio.
    """
    # Convert the dB headrooms to linear amplitude scales.
    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
    scale_rms = 10 ** (-rms_headroom_db / 20)
    if strategy == 'peak':
        rescaling = (scale_peak / wav.abs().max())
        # Always rescale when normalizing; otherwise only shrink to avoid clipping.
        if normalize or rescaling < 1:
            wav = wav * rescaling
    elif strategy == 'clip':
        wav = wav.clamp(-scale_peak, scale_peak)
    elif strategy == 'rms':
        # Scale based on the RMS of the mono mixdown.
        mono = wav.mean(dim=0)
        rescaling = scale_rms / mono.pow(2).mean().sqrt()
        if normalize or rescaling < 1:
            wav = wav * rescaling
        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
    elif strategy == 'loudness':
        assert sample_rate is not None, "Loudness normalization requires sample rate."
        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
    else:
        # No normalization requested: just check the signal is already in range.
        assert wav.abs().max() < 1
        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
    return wav
147
+
148
+
149
def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert audio to float 32 bits PCM format."""
    if wav.dtype.is_floating_point:
        return wav
    # Integer PCM: divide by the dtype's full-scale value to land in [-1, 1].
    if wav.dtype == torch.int16:
        return wav.float() / 2**15
    if wav.dtype == torch.int32:
        return wav.float() / 2**31
    raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
159
+
160
+
161
def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert audio to int 16 bits PCM format.

    ..Warning:: There exist many formula for doing this conversion. None are perfect
    due to the asymmetry of the int16 range. One either have possible clipping, DC offset,
    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
    it is possible that `i16_pcm(f32_pcm)) != Identity`.
    """
    if not wav.dtype.is_floating_point:
        assert wav.dtype == torch.int16
        return wav
    assert wav.abs().max() <= 1
    scaled = (wav * 2 ** 15).round()
    if scaled.max() >= 2 ** 15:  # clipping would occur
        # Rescale with 2**15 - 1 so that +1.0 maps to the largest representable value.
        scaled = (wav * (2 ** 15 - 1)).round()
    return scaled.short()
audiocraft/data/info_audio_dataset.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Base classes for the datasets that also provide non-audio metadata,
7
+ e.g. description, text transcription etc.
8
+ """
9
+ from dataclasses import dataclass
10
+ import logging
11
+ import math
12
+ import re
13
+ import typing as tp
14
+
15
+ import torch
16
+
17
+ from .audio_dataset import AudioDataset, AudioMeta
18
+ from ..environment import AudioCraftEnvironment
19
+ from ..modules.conditioners import SegmentWithAttributes, ConditioningAttributes
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def _clusterify_meta(meta: AudioMeta) -> AudioMeta:
    """Monkey-patch meta to match cluster specificities."""
    # Remap the audio path, and the zip path of the side-car info when present.
    remap = AudioCraftEnvironment.apply_dataset_mappers
    meta.path = remap(meta.path)
    if meta.info_path is not None:
        meta.info_path.zip_path = remap(meta.info_path.zip_path)
    return meta
31
+
32
+
33
def clusterify_all_meta(meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
    """Monkey-patch all meta to match cluster specificities."""
    return list(map(_clusterify_meta, meta))
36
+
37
+
38
@dataclass
class AudioInfo(SegmentWithAttributes):
    """Dummy SegmentInfo with empty attributes.

    The InfoAudioDataset is expected to return metadata that inherits
    from SegmentWithAttributes class and can return conditioning attributes.

    This basically guarantees all datasets will be compatible with current
    solver that contain conditioners requiring this.
    """
    audio_tokens: tp.Optional[torch.Tensor] = None  # populated when using cached batch for training a LM.

    def to_condition_attributes(self) -> ConditioningAttributes:
        # No metadata here: return empty conditioning attributes.
        return ConditioningAttributes()
52
+
53
+
54
class InfoAudioDataset(AudioDataset):
    """AudioDataset that always returns metadata as SegmentWithAttributes along with the audio waveform.

    See `audiocraft.data.audio_dataset.AudioDataset` for initialization arguments.
    """
    def __init__(self, meta: tp.List[AudioMeta], **kwargs):
        # Remap the paths for the current cluster before handing over to AudioDataset.
        super().__init__(clusterify_all_meta(meta), **kwargs)

    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentWithAttributes]]:
        if not self.return_info:
            # Plain tensor mode: behave exactly like the base class.
            wav = super().__getitem__(index)
            assert isinstance(wav, torch.Tensor)
            return wav
        # Wrap the plain SegmentInfo into an AudioInfo with (empty) attributes.
        wav, meta = super().__getitem__(index)
        return wav, AudioInfo(**meta.to_dict())
69
+
70
+
71
def get_keyword_or_keyword_list(value: tp.Optional[str]) -> tp.Union[tp.Optional[str], tp.Optional[tp.List[str]]]:
    """Preprocess a single keyword or possibly a list of keywords."""
    # Dispatch on the runtime type: lists get per-item keyword cleanup.
    return get_keyword_list(value) if isinstance(value, list) else get_keyword(value)
77
+
78
+
79
def get_string(value: tp.Optional[str]) -> tp.Optional[str]:
    """Preprocess a single free-form string, mapping empty or placeholder values to None."""
    if not isinstance(value, str) or not value or value == 'None':
        return None
    return value.strip()
85
+
86
+
87
def get_keyword(value: tp.Optional[str]) -> tp.Optional[str]:
    """Preprocess a single keyword: strip and lowercase, mapping empty or placeholder values to None."""
    if not isinstance(value, str) or not value or value == 'None':
        return None
    return value.strip().lower()
93
+
94
+
95
def get_keyword_list(values: tp.Union[str, tp.List[str]]) -> tp.Optional[tp.List[str]]:
    """Preprocess a list of keywords; returns None when nothing valid remains."""
    if isinstance(values, str):
        # Split a raw string on commas/whitespace into candidate keywords.
        values = [v.strip() for v in re.split(r'[,\s]', values)]
    elif isinstance(values, float) and math.isnan(values):
        # NaN typically comes from empty cells in tabular metadata.
        values = []
    if not isinstance(values, list):
        logger.debug(f"Unexpected keyword list {values}")
        values = [str(values)]

    cleaned = [kw for kw in (get_keyword(v) for v in values) if kw is not None]
    return cleaned or None
audiocraft/data/music_dataset.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dataset of music tracks with rich metadata.
7
+ """
8
+ from dataclasses import dataclass, field, fields, replace
9
+ import gzip
10
+ import json
11
+ import logging
12
+ from pathlib import Path
13
+ import random
14
+ import typing as tp
15
+
16
+ import torch
17
+
18
+ from .info_audio_dataset import (
19
+ InfoAudioDataset,
20
+ AudioInfo,
21
+ get_keyword_list,
22
+ get_keyword,
23
+ get_string
24
+ )
25
+ from ..modules.conditioners import (
26
+ ConditioningAttributes,
27
+ JointEmbedCondition,
28
+ WavCondition,
29
+ )
30
+ from ..utils.utils import warn_once
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
@dataclass
class MusicInfo(AudioInfo):
    """Segment info augmented with music metadata.
    """
    # music-specific metadata
    title: tp.Optional[str] = None
    artist: tp.Optional[str] = None  # anonymized artist id, used to ensure no overlap between splits
    key: tp.Optional[str] = None
    bpm: tp.Optional[float] = None
    genre: tp.Optional[str] = None
    moods: tp.Optional[list] = None
    keywords: tp.Optional[list] = None
    description: tp.Optional[str] = None
    name: tp.Optional[str] = None
    instrument: tp.Optional[str] = None
    # original wav accompanying the metadata
    self_wav: tp.Optional[WavCondition] = None
    # dict mapping attributes names to tuple of wav, text and metadata
    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)

    @property
    def has_music_meta(self) -> bool:
        # `name` acts as a sentinel for whether the rich metadata was loaded.
        return self.name is not None

    def to_condition_attributes(self) -> ConditioningAttributes:
        """Route each dataclass field to its conditioning bucket (wav / joint-embed / text)."""
        out = ConditioningAttributes()
        for _field in fields(self):
            key, value = _field.name, getattr(self, _field.name)
            if key == 'self_wav':
                out.wav[key] = value
            elif key == 'joint_embed':
                for embed_attribute, embed_cond in value.items():
                    out.joint_embed[embed_attribute] = embed_cond
            else:
                # Text conditioners expect plain strings; flatten list values.
                if isinstance(value, list):
                    value = ' '.join(value)
                out.text[key] = value
        return out

    @staticmethod
    def attribute_getter(attribute):
        """Return the preprocessing function for a metadata attribute (None for raw passthrough)."""
        if attribute == 'bpm':
            preprocess_func = get_bpm
        elif attribute == 'key':
            preprocess_func = get_musical_key
        elif attribute in ['moods', 'keywords']:
            preprocess_func = get_keyword_list
        elif attribute in ['genre', 'name', 'instrument']:
            preprocess_func = get_keyword
        elif attribute in ['title', 'artist', 'description']:
            preprocess_func = get_string
        else:
            preprocess_func = None
        return preprocess_func

    @classmethod
    def from_dict(cls, dictionary: dict, fields_required: bool = False):
        """Build a MusicInfo from a raw metadata dict, preprocessing each known field.

        Args:
            dictionary (dict): Raw metadata mapping field names to values.
            fields_required (bool): If True, raise KeyError for required fields missing
                from the dictionary (fields listed as optional/post-init are exempt).
        """
        _dictionary: tp.Dict[str, tp.Any] = {}

        # allow a subset of attributes to not be loaded from the dictionary
        # these attributes may be populated later
        post_init_attributes = ['self_wav', 'joint_embed']
        optional_fields = ['keywords']

        for _field in fields(cls):
            if _field.name in post_init_attributes:
                continue
            elif _field.name not in dictionary:
                if fields_required and _field.name not in optional_fields:
                    raise KeyError(f"Unexpected missing key: {_field.name}")
            else:
                preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
                value = dictionary[_field.name]
                if preprocess_func:
                    value = preprocess_func(value)
                _dictionary[_field.name] = value
        return cls(**_dictionary)
113
+
114
+
115
def augment_music_info_description(music_info: MusicInfo, merge_text_p: float = 0.,
                                   drop_desc_p: float = 0., drop_other_p: float = 0.) -> MusicInfo:
    """Augment MusicInfo description with additional metadata fields and potential dropout.
    Additional textual attributes are added given probability 'merge_text_conditions_p' and
    the original textual description is dropped from the augmented description given probability drop_desc_p.

    Args:
        music_info (MusicInfo): The music metadata to augment.
        merge_text_p (float): Probability of merging additional metadata to the description.
            If provided value is 0, then no merging is performed.
        drop_desc_p (float): Probability of dropping the original description on text merge.
            if provided value is 0, then no drop out is performed.
        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
    Returns:
        MusicInfo: The MusicInfo with augmented textual description.
    """
    def is_valid_field(field_name: str, field_value: tp.Any) -> bool:
        valid_field_name = field_name in ['key', 'bpm', 'genre', 'moods', 'instrument', 'keywords']
        valid_field_value = field_value is not None and isinstance(field_value, (int, float, str, list))
        # NOTE(review): `rand < drop_other_p` KEEPS the field with probability
        # drop_other_p, while the docstring describes it as a drop probability —
        # confirm which semantics is intended.
        keep_field = random.uniform(0, 1) < drop_other_p
        return valid_field_name and valid_field_value and keep_field

    def process_value(v: tp.Any) -> str:
        # Render scalars directly; join list values with commas.
        if isinstance(v, (int, float, str)):
            return str(v)
        if isinstance(v, list):
            return ", ".join(v)
        else:
            raise ValueError(f"Unknown type for text value! ({type(v), v})")

    description = music_info.description

    metadata_text = ""
    if random.uniform(0, 1) < merge_text_p:
        # Build "name: value" pairs from the kept fields, in random order.
        meta_pairs = [f'{_field.name}: {process_value(getattr(music_info, _field.name))}'
                      for _field in fields(music_info) if is_valid_field(_field.name, getattr(music_info, _field.name))]
        random.shuffle(meta_pairs)
        metadata_text = ". ".join(meta_pairs)
        # Optionally drop the original description entirely.
        description = description if not random.uniform(0, 1) < drop_desc_p else None
        logger.debug(f"Applying text augmentation on MMI info. description: {description}, metadata: {metadata_text}")

    if description is None:
        # Fall back to metadata only (or None when there is none worth keeping).
        description = metadata_text if len(metadata_text) > 1 else None
    else:
        description = ". ".join([description.rstrip('.'), metadata_text])
    description = description.strip() if description else None

    # Shallow-copy so the caller's MusicInfo keeps its original description.
    music_info = replace(music_info)
    music_info.description = description
    return music_info
165
+
166
+
167
class Paraphraser:
    """Randomly substitutes a track description with a pre-computed paraphrase.

    The paraphrase source is a JSON (optionally gzip-compressed) dict whose keys
    are the original info paths (e.g. ``track_path.json``) and whose values are
    lists of candidate paraphrased descriptions.

    Args:
        paraphrase_source (str or Path): Path to the .json or .json.gz paraphrase file.
        paraphrase_p (float): Probability of replacing a description with a paraphrase.
    """
    def __init__(self, paraphrase_source: tp.Union[str, Path], paraphrase_p: float = 0.):
        self.paraphrase_p = paraphrase_p
        open_fn = gzip.open if str(paraphrase_source).lower().endswith('.gz') else open
        with open_fn(paraphrase_source, 'rb') as f:  # type: ignore
            self.paraphrase_source = json.loads(f.read())
        logger.info(f"loaded paraphrasing source from: {paraphrase_source}")

    def sample_paraphrase(self, audio_path: str, description: str):
        """Return a paraphrase for `description` with probability `paraphrase_p`,
        otherwise return `description` unchanged."""
        if random.random() >= self.paraphrase_p:
            return description
        # JSON object keys are always strings, so the lookup key must be a string:
        # a pathlib.Path would never match a key and paraphrasing would silently never apply.
        info_path = str(Path(audio_path).with_suffix('.json'))
        if info_path not in self.paraphrase_source:
            warn_once(logger, f"{info_path} not in paraphrase source!")
            return description
        new_desc = random.choice(self.paraphrase_source[info_path])
        logger.debug(f"{description} -> {new_desc}")
        return new_desc
185
+
186
+
187
class MusicDataset(InfoAudioDataset):
    """Music dataset is an AudioDataset with music-related metadata.

    Args:
        info_fields_required (bool): Whether to enforce having required fields.
        merge_text_p (float): Probability of merging additional metadata to the description.
        drop_desc_p (float): Probability of dropping the original description on text merge.
        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
        joint_embed_attributes (list[str], optional): A list of attributes for which joint embedding
            metadata is returned. Defaults to no attribute.
        paraphrase_source (str, optional): Path to the .json or .json.gz file containing the
            paraphrases for the description. The json should be a dict with keys are the
            original info path (e.g. track_path.json) and each value is a list of possible
            paraphrased.
        paraphrase_p (float): probability of taking a paraphrase.

    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
    """
    def __init__(self, *args, info_fields_required: bool = True,
                 merge_text_p: float = 0., drop_desc_p: float = 0., drop_other_p: float = 0.,
                 joint_embed_attributes: tp.Optional[tp.List[str]] = None,
                 paraphrase_source: tp.Optional[str] = None, paraphrase_p: float = 0,
                 **kwargs):
        kwargs['return_info'] = True  # We require the info for each song of the dataset.
        super().__init__(*args, **kwargs)
        self.info_fields_required = info_fields_required
        self.merge_text_p = merge_text_p
        self.drop_desc_p = drop_desc_p
        self.drop_other_p = drop_other_p
        # Normalize None to an empty list instead of using a shared mutable default argument.
        self.joint_embed_attributes = joint_embed_attributes if joint_embed_attributes is not None else []
        self.paraphraser = None
        if paraphrase_source is not None:
            self.paraphraser = Paraphraser(paraphrase_source, paraphrase_p)

    def __getitem__(self, index):
        """Return `(wav, music_info)` where `music_info` carries the track metadata,
        optionally paraphrased and/or augmented with the extra metadata fields."""
        wav, info = super().__getitem__(index)
        info_data = info.to_dict()
        music_info_path = Path(info.meta.path).with_suffix('.json')

        if Path(music_info_path).exists():
            with open(music_info_path, 'r') as json_file:
                music_data = json.load(json_file)
                # Segment-level info takes precedence over the static JSON metadata.
                music_data.update(info_data)
                music_info = MusicInfo.from_dict(music_data, fields_required=self.info_fields_required)
            if self.paraphraser is not None:
                # Paraphraser exposes `sample_paraphrase` (there is no `sample` method);
                # calling the correct name avoids an AttributeError when paraphrasing is enabled.
                music_info.description = self.paraphraser.sample_paraphrase(
                    music_info.meta.path, music_info.description)
            if self.merge_text_p:
                music_info = augment_music_info_description(
                    music_info, self.merge_text_p, self.drop_desc_p, self.drop_other_p)
        else:
            # No metadata JSON next to the audio file: keep only the segment info.
            music_info = MusicInfo.from_dict(info_data, fields_required=False)

        music_info.self_wav = WavCondition(
            wav=wav[None], length=torch.tensor([info.n_frames]),
            sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])

        # Attach a joint-embedding condition (wav + attribute value) for each requested attribute.
        for att in self.joint_embed_attributes:
            att_value = getattr(music_info, att)
            joint_embed_cond = JointEmbedCondition(
                wav[None], [att_value], torch.tensor([info.n_frames]),
                sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
            music_info.joint_embed[att] = joint_embed_cond

        return wav, music_info
250
+
251
+
252
def get_musical_key(value: tp.Optional[str]) -> tp.Optional[str]:
    """Preprocess key keywords, discarding them if there are multiple key defined."""
    if not isinstance(value, str):
        # Covers None and any non-string metadata value.
        return None
    if not value or value == 'None':
        return None
    # For now, we discard when multiple keys are defined separated with comas
    if ',' in value:
        return None
    return value.strip().lower()
261
+
262
+
263
def get_bpm(value: tp.Optional[str]) -> tp.Optional[float]:
    """Preprocess a raw metadata bpm value to a float, returning None when unparseable."""
    if value is None:
        return None
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: non-numeric strings (e.g. "fast").
        # TypeError: non-numeric, non-string metadata values (e.g. lists),
        # which would otherwise crash dataset loading.
        return None
audiocraft/data/sound_dataset.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dataset of audio with a simple description.
7
+ """
8
+
9
+ from dataclasses import dataclass, fields, replace
10
+ import json
11
+ from pathlib import Path
12
+ import random
13
+ import typing as tp
14
+
15
+ import numpy as np
16
+ import torch
17
+
18
+ from .info_audio_dataset import (
19
+ InfoAudioDataset,
20
+ get_keyword_or_keyword_list
21
+ )
22
+ from ..modules.conditioners import (
23
+ ConditioningAttributes,
24
+ SegmentWithAttributes,
25
+ WavCondition,
26
+ )
27
+
28
+
29
# Numerical floor used to avoid division by zero when normalizing by RMS or peak values.
EPS = torch.finfo(torch.float32).eps
# Range (in dBFS) from which the final mixture loudness is sampled in `snr_mixer`.
TARGET_LEVEL_LOWER = -35
TARGET_LEVEL_UPPER = -15
32
+
33
+
34
@dataclass
class SoundInfo(SegmentWithAttributes):
    """Segment info augmented with Sound metadata.
    """
    description: tp.Optional[str] = None
    self_wav: tp.Optional[torch.Tensor] = None

    @property
    def has_sound_meta(self) -> bool:
        """Whether a textual description is attached to this segment."""
        return self.description is not None

    def to_condition_attributes(self) -> ConditioningAttributes:
        """Pack every dataclass field into a ConditioningAttributes object."""
        out = ConditioningAttributes()
        for _field in fields(self):
            name = _field.name
            value = getattr(self, name)
            # The reference waveform goes into the wav bucket; everything else is text.
            if name == 'self_wav':
                out.wav[name] = value
            else:
                out.text[name] = value
        return out

    @staticmethod
    def attribute_getter(attribute):
        """Return the preprocessing callable for the given attribute, or None."""
        return get_keyword_or_keyword_list if attribute == 'description' else None

    @classmethod
    def from_dict(cls, dictionary: dict, fields_required: bool = False):
        """Build a SoundInfo from a raw metadata dict, preprocessing known attributes.

        Args:
            dictionary (dict): Raw metadata mapping field names to values.
            fields_required (bool): If True, raise KeyError for any missing field.
        """
        _dictionary: tp.Dict[str, tp.Any] = {}

        # allow a subset of attributes to not be loaded from the dictionary
        # these attributes may be populated later
        post_init_attributes = ['self_wav']

        for _field in fields(cls):
            name = _field.name
            if name in post_init_attributes:
                continue
            if name not in dictionary:
                if fields_required:
                    raise KeyError(f"Unexpected missing key: {name}")
                continue
            preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(name)
            value = dictionary[name]
            if preprocess_func:
                value = preprocess_func(value)
            _dictionary[name] = value
        return cls(**_dictionary)
85
+
86
+
87
class SoundDataset(InfoAudioDataset):
    """Sound audio dataset: Audio dataset with environmental sound-specific metadata.

    Args:
        info_fields_required (bool): Whether all the mandatory metadata fields should be in the loaded metadata.
        external_metadata_source (tp.Optional[str]): Folder containing JSON metadata for the corresponding dataset.
            The metadata files contained in this folder are expected to match the stem of the audio file with
            a json extension.
        aug_p (float): Probability of performing audio mixing augmentation on the batch.
        mix_p (float): Proportion of batch items that are mixed together when applying audio mixing augmentation.
        mix_snr_low (int): Lowerbound for SNR value sampled for mixing augmentation.
        mix_snr_high (int): Upperbound for SNR value sampled for mixing augmentation.
        mix_min_overlap (float): Minimum overlap between audio files when performing mixing augmentation.
        kwargs: Additional arguments for AudioDataset.

    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
    """
    def __init__(
        self,
        *args,
        info_fields_required: bool = True,
        external_metadata_source: tp.Optional[str] = None,
        aug_p: float = 0.,
        mix_p: float = 0.,
        mix_snr_low: int = -5,
        mix_snr_high: int = 5,
        mix_min_overlap: float = 0.5,
        **kwargs
    ):
        kwargs['return_info'] = True  # We require the info for each song of the dataset.
        super().__init__(*args, **kwargs)
        self.info_fields_required = info_fields_required
        self.external_metadata_source = external_metadata_source
        self.aug_p = aug_p
        self.mix_p = mix_p
        # Mixing augmentation requires a mixing proportion and mono audio.
        if self.aug_p > 0:
            assert self.mix_p > 0, "Expecting some mixing proportion mix_p if aug_p > 0"
            assert self.channels == 1, "SoundDataset with audio mixing considers only monophonic audio"
        self.mix_snr_low = mix_snr_low
        self.mix_snr_high = mix_snr_high
        self.mix_min_overlap = mix_min_overlap

    def _get_info_path(self, path: tp.Union[str, Path]) -> Path:
        """Get path of JSON with metadata (description, etc.).
        If there exists a JSON with the same name as 'path.name', then it will be used.
        Else, such JSON will be searched for in an external json source folder if it exists.
        """
        info_path = Path(path).with_suffix('.json')
        if info_path.exists():
            return info_path
        if self.external_metadata_source:
            candidate = Path(self.external_metadata_source) / info_path.name
            if candidate.exists():
                return candidate
        raise Exception(f"Unable to find a metadata JSON for path: {path}")

    def __getitem__(self, index):
        """Return `(wav, sound_info)` with the description loaded from the metadata JSON."""
        wav, info = super().__getitem__(index)
        info_data = info.to_dict()
        info_path = self._get_info_path(info.meta.path)
        if Path(info_path).exists():
            with open(info_path, 'r') as json_file:
                sound_data = json.load(json_file)
            # Segment-level info takes precedence over the static JSON metadata.
            sound_data.update(info_data)
            sound_info = SoundInfo.from_dict(sound_data, fields_required=self.info_fields_required)
            # if there are multiple descriptions, sample one randomly
            if isinstance(sound_info.description, list):
                sound_info.description = random.choice(sound_info.description)
        else:
            sound_info = SoundInfo.from_dict(info_data, fields_required=False)

        sound_info.self_wav = WavCondition(
            wav=wav[None], length=torch.tensor([info.n_frames]),
            sample_rate=[sound_info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])

        return wav, sound_info

    def collater(self, samples):
        """Collate a batch; audio mixing augmentation happens here when enabled."""
        wav, sound_info = super().collater(samples)  # SoundDataset always returns infos
        if self.aug_p > 0:
            wav, sound_info = mix_samples(
                wav, sound_info, self.aug_p, self.mix_p,
                snr_low=self.mix_snr_low, snr_high=self.mix_snr_high,
                min_overlap=self.mix_min_overlap)
        return wav, sound_info
171
+
172
+
173
def rms_f(x: torch.Tensor) -> torch.Tensor:
    """Root-mean-square of each row of `x` (reduced over dim 1)."""
    return torch.sqrt(torch.mean(x ** 2, dim=1))
175
+
176
+
177
def normalize(audio: torch.Tensor, target_level: int = -25) -> torch.Tensor:
    """Normalize the signal to the target level.

    Each row is scaled so its RMS matches `target_level` dBFS.
    """
    target_amplitude = 10 ** (target_level / 20)
    gain = target_amplitude / (rms_f(audio) + EPS)
    return audio * gain.unsqueeze(1)
183
+
184
+
185
def is_clipped(audio: torch.Tensor, clipping_threshold: float = 0.99) -> torch.Tensor:
    """Per-row boolean: True when any sample exceeds the clipping threshold in magnitude."""
    return audio.abs().gt(clipping_threshold).any(dim=1)
187
+
188
+
189
def mix_pair(src: torch.Tensor, dst: torch.Tensor, min_overlap: float) -> torch.Tensor:
    """Overlap-add `dst` onto `src` at a random offset; `src` is modified in place.

    The offset is drawn so that at least `min_overlap` of `src` is covered;
    any part of `dst` extending past the end of `src` is dropped.
    """
    offset = random.randint(0, int(src.shape[1] * (1 - min_overlap)))
    tail = src.shape[1] - offset
    if dst.shape[1] > tail:
        src[:, offset:] += dst[:, :tail]
    else:
        src[:, offset:offset + dst.shape[1]] += dst
    return src
197
+
198
+
199
def snr_mixer(clean: torch.Tensor, noise: torch.Tensor, snr: int, min_overlap: float,
              target_level: int = -25, clipping_threshold: float = 0.99) -> torch.Tensor:
    """Function to mix clean speech and noise at various SNR levels.

    Args:
        clean (torch.Tensor): Clean audio source to mix, of shape [B, T].
        noise (torch.Tensor): Noise audio source to mix, of shape [B, T].
        snr (int): SNR level when mixing.
        min_overlap (float): Minimum overlap between the two mixed sources.
        target_level (int): Gain level in dB.
        clipping_threshold (float): Threshold for clipping the audio.
    Returns:
        torch.Tensor: The mixed audio, of shape [B, T].
    """
    # Match lengths: zero-pad the noise up to the clean length, or truncate it.
    if clean.shape[1] > noise.shape[1]:
        noise = torch.nn.functional.pad(noise, (0, clean.shape[1] - noise.shape[1]))
    else:
        noise = noise[:, :clean.shape[1]]

    # normalizing to -25 dB FS
    # NOTE(review): `clean.max(1)[0].abs()` is |max|, not max(|x|); for an all-negative
    # row this underestimates the true peak — confirm whether peak normalization is intended.
    clean = clean / (clean.max(1)[0].abs().unsqueeze(1) + EPS)
    clean = normalize(clean, target_level)
    rmsclean = rms_f(clean)

    noise = noise / (noise.max(1)[0].abs().unsqueeze(1) + EPS)
    noise = normalize(noise, target_level)
    rmsnoise = rms_f(noise)

    # set the noise level for a given SNR
    noisescalar = (rmsclean / (10 ** (snr / 20)) / (rmsnoise + EPS)).unsqueeze(1)
    noisenewlevel = noise * noisescalar

    # mix noise and clean speech (in-place overlap-add at a random offset, see `mix_pair`)
    noisyspeech = mix_pair(clean, noisenewlevel, min_overlap)

    # randomly select RMS value between -15 dBFS and -35 dBFS and normalize noisyspeech with that value
    # there is a chance of clipping that might happen with very less probability, which is not a major issue.
    noisy_rms_level = np.random.randint(TARGET_LEVEL_LOWER, TARGET_LEVEL_UPPER)
    rmsnoisy = rms_f(noisyspeech)
    scalarnoisy = (10 ** (noisy_rms_level / 20) / (rmsnoisy + EPS)).unsqueeze(1)
    noisyspeech = noisyspeech * scalarnoisy
    # `clean` and `noisenewlevel` are rescaled for consistency even though only
    # `noisyspeech` is returned by this function.
    clean = clean * scalarnoisy
    noisenewlevel = noisenewlevel * scalarnoisy

    # final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
    clipped = is_clipped(noisyspeech)
    if clipped.any():
        noisyspeech_maxamplevel = noisyspeech[clipped].max(1)[0].abs().unsqueeze(1) / (clipping_threshold - EPS)
        noisyspeech[clipped] = noisyspeech[clipped] / noisyspeech_maxamplevel

    return noisyspeech
250
+
251
+
252
def snr_mix(src: torch.Tensor, dst: torch.Tensor, snr_low: int, snr_high: int, min_overlap: float):
    """Mix `dst` into `src` at an SNR drawn from [snr_low, snr_high).

    Note: `np.random.randint` excludes the upper bound, so `snr_high` itself
    is only used when it equals `snr_low`.
    """
    snr = snr_low if snr_low == snr_high else np.random.randint(snr_low, snr_high)
    return snr_mixer(src, dst, snr, min_overlap)
259
+
260
+
261
def mix_text(src_text: str, dst_text: str):
    """Mix text from different sources by concatenating them.

    Identical descriptions are returned once instead of being duplicated.
    """
    if src_text != dst_text:
        return " ".join((src_text, dst_text))
    return src_text
266
+
267
+
268
def mix_samples(wavs: torch.Tensor, infos: tp.List[SoundInfo], aug_p: float, mix_p: float,
                snr_low: int, snr_high: int, min_overlap: float):
    """Mix samples within a batch, summing the waveforms and concatenating the text infos.

    Note that both branches shrink the batch to ``k = int(mix_p * B)`` items so the
    effective batch size is the same whether or not mixing was applied.

    Args:
        wavs (torch.Tensor): Audio tensors of shape [B, C, T].
        infos (list[SoundInfo]): List of SoundInfo items corresponding to the audio.
        aug_p (float): Augmentation probability.
        mix_p (float): Proportion of items in the batch to mix (and merge) together.
        snr_low (int): Lowerbound for sampling SNR.
        snr_high (int): Upperbound for sampling SNR.
        min_overlap (float): Minimum overlap between mixed samples.
    Returns:
        tuple[torch.Tensor, list[SoundInfo]]: A tuple containing the mixed wavs
            and mixed SoundInfo for the given batch.
    """
    # no mixing to perform within the batch
    if mix_p == 0:
        return wavs, infos

    if random.uniform(0, 1) < aug_p:
        # perform all augmentations on waveforms as [B, T]
        # randomly picking pairs of audio to mix
        assert wavs.size(1) == 1, f"Mix samples requires monophonic audio but C={wavs.size(1)}"
        wavs = wavs.mean(dim=1, keepdim=False)
        B, T = wavs.shape
        k = int(mix_p * B)
        # NOTE(review): sources and targets come from two independent permutations,
        # so an item can occasionally be mixed with itself — confirm this is intended.
        mixed_sources_idx = torch.randperm(B)[:k]
        mixed_targets_idx = torch.randperm(B)[:k]
        aug_wavs = snr_mix(
            wavs[mixed_sources_idx],
            wavs[mixed_targets_idx],
            snr_low,
            snr_high,
            min_overlap,
        )
        # mixing textual descriptions in metadata
        descriptions = [info.description for info in infos]
        aug_infos = []
        for i, j in zip(mixed_sources_idx, mixed_targets_idx):
            text = mix_text(descriptions[i], descriptions[j])
            # Copy the source info and only replace its description with the merged text.
            m = replace(infos[i])
            m.description = text
            aug_infos.append(m)

        # back to [B, C, T]
        aug_wavs = aug_wavs.unsqueeze(1)
        assert aug_wavs.shape[0] > 0, "Samples mixing returned empty batch."
        assert aug_wavs.dim() == 3, f"Returned wav should be [B, C, T] but dim = {aug_wavs.dim()}"
        assert aug_wavs.shape[0] == len(aug_infos), "Mismatch between number of wavs and infos in the batch"

        return aug_wavs, aug_infos  # [B, C, T]
    else:
        # randomly pick samples in the batch to match
        # the batch size when performing audio mixing
        B, C, T = wavs.shape
        k = int(mix_p * B)
        wav_idx = torch.randperm(B)[:k]
        wavs = wavs[wav_idx]
        infos = [infos[i] for i in wav_idx]
        assert wavs.shape[0] == len(infos), "Mismatch between number of wavs and infos in the batch"

        return wavs, infos  # [B, C, T]
audiocraft/data/zip.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Utility for reading some info from inside a zip file.
7
+ """
8
+
9
+ import typing
10
+ import zipfile
11
+
12
+ from dataclasses import dataclass
13
+ from functools import lru_cache
14
+ from typing_extensions import Literal
15
+
16
+
17
# Default number of zip archives kept open simultaneously by the LRU cache below.
DEFAULT_SIZE = 32
# Valid modes for opening a zip archive (mirrors `zipfile.ZipFile` modes).
MODE = Literal['r', 'w', 'x', 'a']
19
+
20
+
21
@dataclass(order=True)
class PathInZip:
    """Hold a path of file within a zip file.

    Args:
        path (str): The convention is <path_to_zip>:<relative_path_inside_zip>.
            Let's assume there is a zip file /some/location/foo.zip
            and inside of it is a json file located at /data/file1.json,
            Then we expect path = "/some/location/foo.zip:/data/file1.json".
    """

    INFO_PATH_SEP = ':'
    zip_path: str
    file_path: str

    def __init__(self, path: str) -> None:
        # Exactly one separator is expected in the combined path.
        parts = path.split(self.INFO_PATH_SEP)
        assert len(parts) == 2
        self.zip_path, self.file_path = parts

    @classmethod
    def from_paths(cls, zip_path: str, file_path: str):
        """Alternate constructor from the two path components."""
        return cls(f"{zip_path}{cls.INFO_PATH_SEP}{file_path}")

    def __str__(self) -> str:
        return f"{self.zip_path}{self.INFO_PATH_SEP}{self.file_path}"
47
+
48
+
49
def _open_zip(path: str, mode: MODE = 'r'):
    """Open the zip archive at `path`; target of the LRU cache defined below."""
    archive = zipfile.ZipFile(path, mode)
    return archive


# Memoized opener so repeated reads from the same archive reuse a single handle.
_cached_open_zip = lru_cache(DEFAULT_SIZE)(_open_zip)
54
+
55
+
56
def set_zip_cache_size(max_size: int):
    """Sets the maximal LRU caching for zip file opening.

    Args:
        max_size (int): the maximal LRU cache.
    """
    global _cached_open_zip
    # Rebuild the memoized opener with the new capacity; previously cached
    # archive handles are dropped (but not explicitly closed) in the process.
    _cached_open_zip = lru_cache(max_size)(_open_zip)
64
+
65
+
66
def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
    """Opens a file stored inside a zip and returns a file-like object.

    Args:
        path_in_zip (PathInZip): A PathInZip object representing the file to return a file-like object of.
        mode (str): The mode in which to open the file with.
    Returns:
        A file-like object for PathInZip.
    """
    zf = _cached_open_zip(path_in_zip.zip_path)
    # Forward `mode` to ZipFile.open: previously the parameter was accepted
    # but silently ignored, so every file was opened in read mode.
    return zf.open(path_in_zip.file_path, mode)
audiocraft/environment.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Provides cluster and tools configuration across clusters (slurm, dora, utilities).
9
+ """
10
+
11
+ import logging
12
+ import os
13
+ from pathlib import Path
14
+ import re
15
+ import typing as tp
16
+
17
+ import omegaconf
18
+
19
+ from .utils.cluster import _guess_cluster_type
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class AudioCraftEnvironment:
    """Environment configuration for teams and clusters.

    AudioCraftEnvironment picks compute cluster settings (slurm, dora) from the current running environment
    or declared variable and the loaded team configuration. Additionally, the AudioCraftEnvironment
    provides pointers to a reference folder resolved automatically across clusters that is shared across team members,
    allowing to share sigs or other files to run jobs. Finally, it provides dataset mappers to automatically
    map dataset file paths to new locations across clusters, allowing to use the same manifest of files across clusters.

    The cluster type is identified automatically and base configuration file is read from config/teams.yaml.
    Use the following environment variables to specify the cluster, team or configuration:

        AUDIOCRAFT_CLUSTER (optional): Cluster type to enforce. Useful if the cluster type
            cannot be inferred automatically.
        AUDIOCRAFT_CONFIG (optional): Path to yaml config holding the teams configuration.
            If not set, configuration is read from config/teams.yaml.
        AUDIOCRAFT_TEAM (optional): Name of the team. Recommended to set to your own team.
            Cluster configuration are shared across teams to match compute allocation,
            specify your cluster configuration in the configuration file under a key mapping
            your team name.
    """
    _instance = None
    DEFAULT_TEAM = "default"

    def __init__(self) -> None:
        """Loads configuration."""
        self.team: str = os.getenv("AUDIOCRAFT_TEAM", self.DEFAULT_TEAM)
        cluster_type = _guess_cluster_type()
        cluster = os.getenv(
            "AUDIOCRAFT_CLUSTER", cluster_type.value
        )
        logger.info("Detecting cluster type %s", cluster_type)

        self.cluster: str = cluster

        # Default config lives at config/teams/<team>.yaml relative to the package root.
        config_path = os.getenv(
            "AUDIOCRAFT_CONFIG",
            Path(__file__)
            .parent.parent.joinpath("config/teams", self.team)
            .with_suffix(".yaml"),
        )
        self.config = omegaconf.OmegaConf.load(config_path)
        # Pre-compile the dataset path-rewriting rules declared for this cluster.
        self._dataset_mappers = []
        cluster_config = self._get_cluster_config()
        if "dataset_mappers" in cluster_config:
            for pattern, repl in cluster_config["dataset_mappers"].items():
                regex = re.compile(pattern)
                self._dataset_mappers.append((regex, repl))

    def _get_cluster_config(self) -> omegaconf.DictConfig:
        """Return the configuration subtree for the current cluster."""
        assert isinstance(self.config, omegaconf.DictConfig)
        return self.config[self.cluster]

    @classmethod
    def instance(cls):
        """Return the lazily-created singleton instance."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def reset(cls):
        """Clears the environment and forces a reload on next invocation."""
        cls._instance = None

    @classmethod
    def get_team(cls) -> str:
        """Gets the selected team as dictated by the AUDIOCRAFT_TEAM env var.
        If not defined, defaults to "default".
        """
        return cls.instance().team

    @classmethod
    def get_cluster(cls) -> str:
        """Gets the detected cluster.
        This value can be overridden by the AUDIOCRAFT_CLUSTER env var.
        """
        return cls.instance().cluster

    @classmethod
    def get_dora_dir(cls) -> Path:
        """Gets the path to the dora directory for the current team and cluster.
        Value is overridden by the AUDIOCRAFT_DORA_DIR env var.
        """
        cluster_config = cls.instance()._get_cluster_config()
        dora_dir = os.getenv("AUDIOCRAFT_DORA_DIR", cluster_config["dora_dir"])
        logger.warning(f"Dora directory: {dora_dir}")
        return Path(dora_dir)

    @classmethod
    def get_reference_dir(cls) -> Path:
        """Gets the path to the reference directory for the current team and cluster.
        Value is overridden by the AUDIOCRAFT_REFERENCE_DIR env var.
        """
        cluster_config = cls.instance()._get_cluster_config()
        return Path(os.getenv("AUDIOCRAFT_REFERENCE_DIR", cluster_config["reference_dir"]))

    @classmethod
    def get_slurm_exclude(cls) -> tp.Optional[str]:
        """Get the list of nodes to exclude for that cluster."""
        cluster_config = cls.instance()._get_cluster_config()
        return cluster_config.get("slurm_exclude")

    @classmethod
    def get_slurm_partitions(cls, partition_types: tp.Optional[tp.List[str]] = None) -> str:
        """Gets the requested partitions for the current team and cluster as a comma-separated string.

        Args:
            partition_types (list[str], optional): partition types to retrieve. Values must be
                from ['global', 'team']. If not provided, the global partition is returned.
        """
        if not partition_types:
            partition_types = ["global"]

        cluster_config = cls.instance()._get_cluster_config()
        partitions = [
            cluster_config["partitions"][partition_type]
            for partition_type in partition_types
        ]
        return ",".join(partitions)

    @classmethod
    def resolve_reference_path(cls, path: tp.Union[str, Path]) -> Path:
        """Converts reference placeholder in path with configured reference dir to resolve paths.

        Args:
            path (str or Path): Path to resolve.
        Returns:
            Path: Resolved path.
        """
        path = str(path)

        if path.startswith("//reference"):
            reference_dir = cls.get_reference_dir()
            # `Logger.warn` is a deprecated alias of `Logger.warning`.
            logger.warning(f"Reference directory: {reference_dir}")
            assert (
                reference_dir.exists() and reference_dir.is_dir()
            ), f"Reference directory does not exist: {reference_dir}."
            path = re.sub("^//reference", str(reference_dir), path)

        return Path(path)

    @classmethod
    def apply_dataset_mappers(cls, path: str) -> str:
        """Applies dataset mapping regex rules as defined in the configuration.
        If no rules are defined, the path is returned as-is.
        """
        instance = cls.instance()

        for pattern, repl in instance._dataset_mappers:
            path = pattern.sub(repl, path)

        return path
audiocraft/grids/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dora Grids."""
audiocraft/grids/_base_explorers.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from abc import ABC, abstractmethod
8
+ import time
9
+ import typing as tp
10
+ from dora import Explorer
11
+ import treetable as tt
12
+
13
+
14
def get_sheep_ping(sheep) -> tp.Optional[str]:
    """Return the amount of time since the Sheep made some update
    to its log. Returns a str using the relevant time unit."""
    if sheep.log is None or not sheep.log.exists():
        return None
    delta = time.time() - sheep.log.stat().st_mtime
    # Pick the largest time unit that keeps the number readable.
    if delta > 3600 * 24:
        return f'{delta / (3600 * 24):.1f}d'
    if delta > 3600:
        return f'{delta / (3600):.1f}h'
    if delta > 60:
        return f'{delta / 60:.1f}m'
    return f'{delta:.1f}s'
29
+
30
+
31
class BaseExplorer(ABC, Explorer):
    """Base explorer for AudioCraft grids.

    All task specific solvers are expected to implement the `get_grid_metrics`
    method to specify logic about metrics to display for a given task.

    If additional stages are used, the child explorer must define how to handle
    these new stages in the `process_history` and `process_sheep` methods.
    """
    def stages(self):
        # Default solver stages tracked in the grid table.
        return ["train", "valid", "evaluate"]

    def get_grid_meta(self):
        """Returns the list of Meta information to display for each XP/job.
        """
        return [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
            tt.leaf("sid", align="<"),
        ]

    @abstractmethod
    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table.
        """
        ...

    def process_sheep(self, sheep, history):
        """Fold a job's metric history into one dict of per-stage metric dicts.

        Later history entries overwrite earlier values for the same key,
        so each stage ends up with its most recent metrics.
        """
        # "epoch" counts the number of history entries seen so far.
        train = {
            "epoch": len(history),
        }
        parts = {"train": train}
        for metrics in history:
            for key, sub in metrics.items():
                part = parts.get(key, {})
                if 'duration' in sub:
                    # Convert to minutes for readability.
                    sub['duration'] = sub['duration'] / 60.
                part.update(sub)
                parts[key] = part
        ping = get_sheep_ping(sheep)
        if ping is not None:
            for name in self.stages():
                if name not in parts:
                    parts[name] = {}
                # Add the ping to each part for convenience.
                parts[name]['ping'] = ping
        return parts
audiocraft/grids/audiogen/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """AudioGen grids."""
audiocraft/grids/audiogen/audiogen_base_16khz.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ..musicgen._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Train the base AudioGen model (16 kHz) on 64 GPUs."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=64, partition=slurm_partitions)
    launcher.bind_(solver='audiogen/audiogen_base_16khz')
    # Replace this by the desired environmental sound dataset.
    launcher.bind_(dset='internal/sounds_16khz')

    # Train with FSDP sharding (autocast off) at the medium model scale.
    launcher.bind_({'autocast': False, 'fsdp.use': True})
    launcher({'model/lm/model_scale': 'medium'})
audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Evaluation with objective metrics for the pretrained AudioGen models.
9
+ This grid takes signature from the training grid and runs evaluation-only stage.
10
+
11
+ When running the grid for the first time, please use:
12
+ REGEN=1 dora grid audiogen.audiogen_pretrained_16khz_eval
13
+ and re-use the REGEN=1 option when the grid is changed to force regenerating it.
14
+
15
+ Note that you need the proper metrics external libraries setup to use all
16
+ the objective metrics activated in this grid. Refer to the README for more information.
17
+ """
18
+
19
+ import os
20
+
21
+ from ..musicgen._explorers import GenerationEvalExplorer
22
+ from ...environment import AudioCraftEnvironment
23
+ from ... import train
24
+
25
+
26
def eval(launcher, batch_size: int = 32):  # noqa: A001 — shadows the builtin, but sibling `explorer` calls it by this name
    """Schedule the evaluation-only stage with objective metrics.

    Args:
        launcher: dora launcher to bind the evaluation options on.
        batch_size: batch size used by the evaluation dataset loader.
    """
    eval_launcher = launcher.bind({
        'dset': 'audio/audiocaps_16khz',
        'solver/audiogen/evaluation': 'objective_eval',
        'execute_only': 'evaluate',
        '+dataset.evaluate.batch_size': batch_size,
        '+metrics.fad.tf.batch_size': 32,
    })
    # Binary for FAD computation: replace this path with your own path.
    eval_launcher.bind_({
        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
    })

    sampling = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
    two_step_cfg = {'transformer_lm.two_step_cfg': True}

    # Base objective metrics.
    eval_launcher(sampling, two_step_cfg)
46
+
47
+
48
@GenerationEvalExplorer
def explorer(launcher):
    """Evaluate the pretrained AudioGen models with objective metrics."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=4, partition=slurm_partitions)

    if 'REGEN' not in os.environ:
        # Without REGEN=1, re-launch the previously generated grid as-is.
        grid_folder = train.main.dora.dir / 'grids' / __name__.split('.', 2)[-1]
        with launcher.job_array():
            for entry in grid_folder.iterdir():
                if entry.is_symlink():
                    xp = train.main.get_xp_from_sig(entry.name)
                    launcher(xp.argv)
        return

    base = launcher.bind(solver="audiogen/audiogen_base_16khz")
    base.bind_({'autocast': False, 'fsdp.use': True})

    # Medium pretrained checkpoint.
    medium = base.bind({'continue_from': '//pretrained/facebook/audiogen-medium'})
    medium.bind_({'model/lm/model_scale': 'medium'})
    eval(medium, batch_size=128)
audiocraft/grids/compression/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """EnCodec grids."""
audiocraft/grids/compression/_explorers.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import treetable as tt
8
+
9
+ from .._base_explorers import BaseExplorer
10
+
11
+
12
class CompressionExplorer(BaseExplorer):
    """Explorer for EnCodec compression solvers."""
    eval_metrics = ["sisnr", "visqol"]

    def stages(self):
        """Stages tracked for compression experiments."""
        return ["train", "valid", "evaluate"]

    def get_grid_meta(self):
        """Return the list of Meta information to display for each XP/job."""
        return [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
        ]

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        train_group = tt.group(
            "train",
            [
                tt.leaf("epoch"),
                tt.leaf("bandwidth", ".2f"),
                tt.leaf("adv", ".4f"),
                tt.leaf("d_loss", ".4f"),
            ],
            align=">",
        )
        valid_group = tt.group(
            "valid",
            [
                tt.leaf("bandwidth", ".2f"),
                tt.leaf("adv", ".4f"),
                tt.leaf("msspec", ".4f"),
                tt.leaf("sisnr", ".2f"),
            ],
            align=">",
        )
        evaluate_group = tt.group(
            "evaluate",
            [tt.leaf(name, ".3f") for name in self.eval_metrics],
            align=">",
        )
        return [train_group, valid_group, evaluate_group]
audiocraft/grids/compression/debug.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel and experiment by commenting its line.
11
+
12
+ This grid is a minimal example for debugging compression task
13
+ and how to override parameters directly in a grid.
14
+ Learn more about dora grids: https://github.com/facebookresearch/dora
15
+ """
16
+
17
+ from ._explorers import CompressionExplorer
18
+ from ...environment import AudioCraftEnvironment
19
+
20
+
21
@CompressionExplorer
def explorer(launcher):
    """Minimal debug grid for the compression task."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=2, partition=slurm_partitions)
    launcher.bind_(solver='compression/debug')

    with launcher.job_array():
        # Base debug task using config from solver=compression/debug.
        launcher()
        # Parameters can be overridden directly in the grid to launch additional xps.
        launcher({'rvq.bins': 2048, 'rvq.n_q': 4})
audiocraft/grids/compression/encodec_audiogen_16khz.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel and experiment by commenting its line.
11
+
12
+ This grid shows how to train the new AudioGen EnCodec model at 16 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Train AudioGen's EnCodec model at 16 kHz."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=8, partition=slurm_partitions)
    # Configuration for AudioGen's EnCodec model trained on monophonic audio
    # sampled at 16 kHz, with a total stride of 320 (50 Hz frame rate).
    launcher.bind_(solver='compression/encodec_audiogen_16khz')
    # Replace this by the desired sound dataset.
    launcher.bind_(dset='internal/sounds_16khz')
    # Launch the xp.
    launcher()
audiocraft/grids/compression/encodec_base_24khz.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel an experiment by commenting out its line.
11
+
12
+ This grid shows how to train a base causal EnCodec model at 24 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Train a base causal EnCodec model at 24 kHz."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=8, partition=slurm_partitions)
    # Base causal EnCodec trained on monophonic audio sampled at 24 kHz.
    launcher.bind_(solver='compression/encodec_base_24khz')
    # Replace this by the desired dataset.
    launcher.bind_(dset='audio/example')
    # Launch the xp.
    launcher()
audiocraft/grids/compression/encodec_musicgen_32khz.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel and experiment by commenting its line.
11
+
12
+ This grid shows how to train a MusicGen EnCodec model at 32 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Train MusicGen's EnCodec model at 32 kHz."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=8, partition=slurm_partitions)
    # Configuration for MusicGen's EnCodec model trained on monophonic audio
    # sampled at 32 kHz, with a total stride of 640 (50 Hz frame rate).
    launcher.bind_(solver='compression/encodec_musicgen_32khz')
    # Replace this by the desired music dataset.
    launcher.bind_(dset='internal/music_400k_32khz')
    # Launch the base xp.
    launcher()
    # Second xp that also computes ViSQOL during evaluation.
    launcher({
        'metrics.visqol.bin': '/data/home/jadecopet/local/usr/opt/visqol',
        'label': 'visqol',
        'evaluate.metrics.visqol': True
    })
audiocraft/grids/diffusion/4_bands_base_32khz.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Training of the 4 diffusion models described in
9
+ "From Discrete Tokens to High-Fidelity Audio Using Multi-Band Diffusion"
10
+ (paper link).
11
+ """
12
+
13
+ from ._explorers import DiffusionExplorer
14
+
15
+
16
@DiffusionExplorer
def explorer(launcher):
    """Train the four band-specific diffusion models (Multi-Band Diffusion)."""
    launcher.slurm_(gpus=4, partition='learnfair')

    launcher.bind_({'solver': 'diffusion/default',
                    'dset': 'internal/music_10k_32khz'})

    # One model per frequency band: the two lowest bands train without the
    # processor, the two highest with it (larger power_std on the last band).
    band_configs = [
        (0, False, 0.4),
        (1, False, 0.4),
        (2, True, 0.4),
        (3, True, 0.75),
    ]
    with launcher.job_array():
        for idx_band, use_processor, power_std in band_configs:
            launcher({'filter.use': True, 'filter.idx_band': idx_band,
                      "processor.use": use_processor, 'processor.power_std': power_std})
audiocraft/grids/diffusion/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Diffusion grids."""
audiocraft/grids/diffusion/_explorers.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import treetable as tt
8
+
9
+ from .._base_explorers import BaseExplorer
10
+
11
+
12
class DiffusionExplorer(BaseExplorer):
    """Explorer for multi-band diffusion solvers."""
    eval_metrics = ["sisnr", "visqol"]

    def stages(self):
        """Stages tracked, including the EMA variants of valid and evaluate."""
        return ["train", "valid", "valid_ema", "evaluate", "evaluate_ema"]

    def get_grid_meta(self):
        """Return the list of Meta information to display for each XP/job."""
        return [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
        ]

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        def loss_leaves():
            # Fresh leaf instances for each group.
            return [
                tt.leaf("loss", ".3%"),
                # tt.leaf("loss_0", ".3%"),
            ]

        def rvm_leaves():
            # RVM is reported globally and per band (rvm_0 .. rvm_3).
            return [tt.leaf("rvm", ".4f")] + [tt.leaf(f"rvm_{band}", ".4f") for band in range(4)]

        return [
            tt.group("train", [tt.leaf("epoch")] + loss_leaves()[:1], align=">"),
            tt.group("valid", loss_leaves(), align=">"),
            tt.group("valid_ema", loss_leaves(), align=">"),
            tt.group("evaluate", rvm_leaves(), align=">"),
            tt.group("evaluate_ema", rvm_leaves(), align=">"),
        ]
audiocraft/grids/musicgen/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """MusicGen grids."""
audiocraft/grids/musicgen/_explorers.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import treetable as tt
10
+
11
+ from .._base_explorers import BaseExplorer
12
+
13
+
14
class LMExplorer(BaseExplorer):
    """Explorer for language model training grids."""
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        """LM training grids only track the train and valid stages."""
        return ['train', 'valid']

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        train_group = tt.group(
            'train',
            [
                tt.leaf('epoch'),
                tt.leaf('duration', '.1f'),  # duration in minutes
                tt.leaf('ping'),
                tt.leaf('ce', '.4f'),  # cross entropy
                tt.leaf("ppl", '.3f'),  # perplexity
            ],
            align='>',
        )
        valid_group = tt.group(
            'valid',
            [
                tt.leaf('ce', '.4f'),
                tt.leaf('ppl', '.3f'),
                tt.leaf('best_ppl', '.3f'),
            ],
            align='>',
        )
        return [train_group, valid_group]

    def process_sheep(self, sheep, history):
        """Augment the base metrics with the best validation metrics seen so far."""
        parts = super().process_sheep(sheep, history)

        # Metrics to track, with the direction that counts as an improvement.
        track_by = {'ppl': 'lower'}  # values should be in ['lower', 'higher']
        best_metrics = {
            name: float('inf') if mode == 'lower' else float('-inf')
            for name, mode in track_by.items()
        }

        def improved(mode, candidate, current_best):
            return candidate < current_best if mode == 'lower' else candidate > current_best

        # For the validation set, keep track of the best metrics (ppl in this
        # example) so we can conveniently compare metrics between runs in the grid.
        for epoch_metrics in history:
            valid_metrics = epoch_metrics.get('valid', {})
            for name, mode in track_by.items():
                if name in valid_metrics and improved(mode, valid_metrics[name], best_metrics[name]):
                    best_metrics[name] = valid_metrics[name]

        if 'valid' in parts:
            parts['valid'].update({f'best_{name}': value for name, value in best_metrics.items()})
        return parts
67
+
68
+
69
class GenerationEvalExplorer(BaseExplorer):
    """Explorer for evaluation-only generation grids."""
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        """Only the evaluate stage is tracked."""
        return ['evaluate']

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        evaluate_leaves = [
            tt.leaf('epoch', '.3f'),
            tt.leaf('duration', '.1f'),
            tt.leaf('ping'),
            tt.leaf('ce', '.4f'),
            tt.leaf('ppl', '.3f'),
            tt.leaf('fad', '.3f'),
            tt.leaf('kld', '.3f'),
            tt.leaf('text_consistency', '.3f'),
            tt.leaf('chroma_cosine', '.3f'),
        ]
        return [tt.group('evaluate', evaluate_leaves, align='>')]
audiocraft/grids/musicgen/musicgen_base_32khz.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Train MusicGen base models (32 kHz) at several scales and GPU counts."""
    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=32, partition=slurm_partitions)
    launcher.bind_(solver='musicgen/musicgen_base_32khz')
    # Replace this by the desired music dataset.
    launcher.bind_(dset='internal/music_400k_32khz')

    medium = {'model/lm/model_scale': 'medium'}
    large = {'model/lm/model_scale': 'large'}

    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
    wd_low = {'conditioners.description.t5.word_dropout': 0.2}

    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}

    # All runs train with FSDP sharding (autocast off).
    launcher.bind_({'autocast': False, 'fsdp.use': True})

    # Small model on 32 GPUs.
    launcher.slurm_(gpus=32).bind_(label='32gpus')
    with launcher.job_array():
        job = launcher.bind()
        job()

    # Medium model on 64 GPUs, with AdamW.
    launcher.slurm_(gpus=64).bind_(label='64gpus')
    with launcher.job_array():
        job = launcher.bind()
        job(medium, adam)

    # Large model on 96 GPUs, with lower CFG/word dropout and gradient clipping.
    launcher.slurm_(gpus=96).bind_(label='96gpus')
    with launcher.job_array():
        job = launcher.bind()
        job(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})