Spaces:

cchaun
/

music_tagging

Build error

App Files Files Community

cchaun committed on Aug 22, 2022

Commit

0d6426a

1 Parent(s): f25ccdb

add project files

Browse files

Files changed (13) hide show

.gitattributes +1 -0
.gitignore +3 -0
app.py +104 -0
models/attention_modules.py +263 -0
models/best_model.pth +3 -0
models/model.py +622 -0
models/modules.py +271 -0
requirements.txt +5 -0
samples/flute.wav +3 -0
samples/guitar_acoustic.wav +3 -0
samples/guitar_electric.wav +3 -0
samples/piano.wav +3 -0
samples/violin.wav +3 -0

.gitattributes CHANGED Viewed

@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+venv
+__pycache__
+flagged

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# -*- coding: UTF-8 -*-
+import gradio as gr
+import torch, torchaudio
+from timeit import default_timer as timer
+from torchaudio.transforms import Resample
+from models.model import HarmonicCNN
+device = "cuda" if torch.cuda.is_available() else "cpu"
+SAMPLE_RATE = 16000
+AUDIO_LEN = 2.90
+model = HarmonicCNN()
+S = torch.load('models/best_model.pth')
+model.load_state_dict(S)
+# Tag names reported to the user. Order matters: predict() pairs LABELS[i]
+# with the i-th score of the model output.
+LABELS = [
+    "alternative",
+    "ambient",
+    "atmospheric",
+    "chillout",
+    "classical",
+    "dance",
+    "downtempo",
+    "easylistening",
+    "electronic",
+    "experimental",
+    "folk",
+    "funk",
+    "hiphop",
+    "house",
+    "indie",
+    "instrumentalpop",
+    "jazz",
+    "lounge",
+    "metal",
+    "newage",
+    "orchestral",
+    "pop",
+    "popfolk",
+    "poprock",
+    "reggae",
+    "rock",
+    "soundtrack",
+    "techno",
+    "trance",
+    "triphop",
+    "world",
+    "acousticguitar",
+    "bass",
+    "computer",
+    "drummachine",
+    "drums",
+    "electricguitar",
+    "electricpiano",
+    "guitar",
+    "keyboard",
+    "piano",
+    "strings",
+    "synthesizer",
+    "violin",
+    "voice",
+    "emotional",
+    "energetic",
+    "film",
+    "happy",
+    "relaxing"
+]
+# Audio files offered as clickable examples in the Gradio interface.
+example_list = [
+    "samples/guitar_acoustic.wav",
+    "samples/guitar_electric.wav",
+    "samples/piano.wav",
+    "samples/violin.wav",
+    "samples/flute.wav"
+]
+def predict(audio_path):
+    start_time = timer()
+    wav, sample_rate = torchaudio.load(audio_path)
+    if sample_rate > SAMPLE_RATE:
+        resampler = Resample(sample_rate, SAMPLE_RATE)
+        wav = resampler(wav)
+    if wav.shape[0] >= 2:
+        wav = torch.mean(wav, dim=0)
+        wav = wav.unsqueeze(0)
+    model.eval()
+    with torch.inference_mode():
+        pred_probs = model(wav)
+    pred_labels_and_probs = {LABELS[i]: float(pred_probs[0][i]) for i in range(len(LABELS))}
+    pred_time = round(timer() - start_time, 5)
+    return pred_labels_and_probs, pred_time
+# Gradio UI: one audio input handed to predict() as a file path; outputs are
+# the top-10 label scores and the prediction time.
+title = "Music Tagging"
+demo = gr.Interface(fn=predict,
+                    inputs=gr.Audio(type="filepath"),
+                    outputs=[gr.Label(num_top_classes=10, label="Predictions"),
+                             gr.Number(label="Prediction time (s)")],
+                    examples=example_list,
+                    title=title)
+# Start the web app when the script is executed.
+demo.launch(debug=False)

models/attention_modules.py ADDED Viewed

	@@ -0,0 +1,263 @@

+# coding: utf-8
+# Code adopted from https://github.com/huggingface/pytorch-pretrained-BERT
+import math
+import copy
+import torch
+import torch.nn as nn
+import numpy as np
+# Gelu
+def gelu(x):
+    """Implementation of the gelu activation function.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+# LayerNorm
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
+except ImportError:
+#print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
+    class BertLayerNorm(nn.Module):
+        def __init__(self, hidden_size, eps=1e-12):
+            """Construct a layernorm module in the TF style (epsilon inside the square root).
+            """
+            super(BertLayerNorm, self).__init__()
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
+            self.variance_epsilon = eps
+        def forward(self, x):
+            u = x.mean(-1, keepdim=True)
+            s = (x - u).pow(2).mean(-1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+            return self.weight * x + self.bias
+class BertConfig(object):
+    def __init__(self,
+                 vocab_size,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 attention_probs_dropout_prob=0.1,
+                 type_vocab_size=2):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.type_vocab_size = type_vocab_size
+class BertSelfAttention(nn.Module):
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+    def forward(self, hidden_states, attention_mask):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
+class BertSelfOutput(nn.Module):
+    def __init__(self, config):
+        super(BertSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+class BertAttention(nn.Module):
+    def __init__(self, config):
+        super(BertAttention, self).__init__()
+        self.self = BertSelfAttention(config)
+        self.output = BertSelfOutput(config)
+    def forward(self, input_tensor, attention_mask):
+        self_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(self_output, input_tensor)
+        return attention_output
+class BertIntermediate(nn.Module):
+    def __init__(self, config):
+        super(BertIntermediate, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.intermediate_act_fn = gelu
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+class BertOutput(nn.Module):
+    def __init__(self, config):
+        super(BertOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+class BertLayer(nn.Module):
+    def __init__(self, config):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+    def forward(self, hidden_states, attention_mask):
+        attention_output = self.attention(hidden_states, attention_mask)
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+class BertEncoder(nn.Module):
+    def __init__(self, config):
+        super(BertEncoder, self).__init__()
+        layer = BertLayer(config)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+    def forward(self, hidden_states, attention_mask=None, output_all_encoded_layers=True):
+        all_encoder_layers = []
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, attention_mask)
+            if output_all_encoded_layers:
+                all_encoder_layers.append(hidden_states)
+        if not output_all_encoded_layers:
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings.
+    """
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, input_ids, token_type_ids=None):
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids[:, :, 0])
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = input_ids + position_embeddings
+        #embeddings = input_ids
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+class PositionalEncoding(nn.Module):
+    def __init__(self, config):
+        super(PositionalEncoding, self).__init__()
+        emb_dim = config.hidden_size
+        max_len = config.max_position_embeddings
+        self.position_enc = self.position_encoding_init(max_len, emb_dim)
+    @staticmethod
+    def position_encoding_init(n_position, emb_dim):
+        ''' Init the sinusoid position encoding table '''
+        # keep dim 0 for padding token position encoding zero vector
+        position_enc = np.array([
+            [pos / np.power(10000, 2 * (j // 2) / emb_dim) for j in range(emb_dim)]
+            if pos != 0 else np.zeros(emb_dim) for pos in range(n_position)])
+        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # apply sin on 0th,2nd,4th...emb_dim
+        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # apply cos on 1st,3rd,5th...emb_dim
+        return torch.from_numpy(position_enc).type(torch.FloatTensor)
+    def forward(self, word_seq):
+        position_encoding = self.position_enc.unsqueeze(0).expand_as(word_seq)
+        position_encoding = position_encoding.to(word_seq.device)
+        word_pos_encoded = word_seq + position_encoding
+        return word_pos_encoded
+class BertPooler(nn.Module):
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output

models/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0920da2535e92791f5123a59216a3daa0b7c7e9a21873827551a597ba11648a7
+size 14563900

models/model.py ADDED Viewed

	@@ -0,0 +1,622 @@

+# coding: utf-8
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+import torchaudio
+from models.modules import Conv_1d, ResSE_1d, Conv_2d, Res_2d, Conv_V, Conv_H, HarmonicSTFT, Res_2d_mp
+from models.attention_modules import BertConfig, BertEncoder, BertEmbeddings, BertPooler, PositionalEncoding
+class FCN(nn.Module):
+    '''
+    Choi et al. 2016
+    Automatic tagging using deep convolutional neural networks.
+    Fully convolutional network.
+    '''
+    def __init__(self,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=96,
+                n_class=50):
+        super(FCN, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # FCN
+        self.layer1 = Conv_2d(1, 64, pooling=(2,4))
+        self.layer2 = Conv_2d(64, 128, pooling=(2,4))
+        self.layer3 = Conv_2d(128, 128, pooling=(2,4))
+        self.layer4 = Conv_2d(128, 128, pooling=(3,5))
+        self.layer5 = Conv_2d(128, 64, pooling=(4,4))
+        # Dense
+        self.dense = nn.Linear(64, n_class)
+        self.dropout = nn.Dropout(0.5)
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # FCN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        # Dense
+        x = x.view(x.size(0), -1)
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = nn.Sigmoid()(x)
+        return x
+class Musicnn(nn.Module):
+    '''
+    Pons et al. 2017
+    End-to-end learning for music audio tagging at scale.
+    This is the updated implementation of the original paper. Referred to the Musicnn code.
+    https://github.com/jordipons/musicnn
+    '''
+    def __init__(self,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=96,
+                n_class=50,
+                dataset='mtat'):
+        super(Musicnn, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # Pons front-end
+        m1 = Conv_V(1, 204, (int(0.7*96), 7))
+        m2 = Conv_V(1, 204, (int(0.4*96), 7))
+        m3 = Conv_H(1, 51, 129)
+        m4 = Conv_H(1, 51, 65)
+        m5 = Conv_H(1, 51, 33)
+        self.layers = nn.ModuleList([m1, m2, m3, m4, m5])
+        # Pons back-end
+        backend_channel= 512 if dataset=='msd' else 64
+        self.layer1 = Conv_1d(561, backend_channel, 7, 1, 1)
+        self.layer2 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)
+        self.layer3 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)
+        # Dense
+        dense_channel = 500 if dataset=='msd' else 200
+        self.dense1 = nn.Linear((561+(backend_channel*3))*2, dense_channel)
+        self.bn = nn.BatchNorm1d(dense_channel)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(0.5)
+        self.dense2 = nn.Linear(dense_channel, n_class)
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # Pons front-end
+        out = []
+        for layer in self.layers:
+            out.append(layer(x))
+        out = torch.cat(out, dim=1)
+        # Pons back-end
+        length = out.size(2)
+        res1 = self.layer1(out)
+        res2 = self.layer2(res1) + res1
+        res3 = self.layer3(res2) + res2
+        out = torch.cat([out, res1, res2, res3], 1)
+        mp = nn.MaxPool1d(length)(out)
+        avgp = nn.AvgPool1d(length)(out)
+        out = torch.cat([mp, avgp], dim=1)
+        out = out.squeeze(2)
+        out = self.relu(self.bn(self.dense1(out)))
+        out = self.dropout(out)
+        out = self.dense2(out)
+        out = nn.Sigmoid()(out)
+        return out
+class CRNN(nn.Module):
+    '''
+    Choi et al. 2017
+    Convolution recurrent neural networks for music classification.
+    Feature extraction with CNN + temporal summary with RNN
+    '''
+    def __init__(self,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=96,
+                n_class=50):
+        super(CRNN, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # CNN
+        self.layer1 = Conv_2d(1, 64, pooling=(2,2))
+        self.layer2 = Conv_2d(64, 128, pooling=(3,3))
+        self.layer3 = Conv_2d(128, 128, pooling=(4,4))
+        self.layer4 = Conv_2d(128, 128, pooling=(4,4))
+        # RNN
+        self.layer5 = nn.GRU(128, 32, 2, batch_first=True)
+        # Dense
+        self.dropout = nn.Dropout(0.5)
+        self.dense = nn.Linear(32, 50)
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # CCN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        # RNN
+        x = x.squeeze(2)
+        x = x.permute(0, 2, 1)
+        x, _ = self.layer5(x)
+        x = x[:, -1, :]
+        # Dense
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = nn.Sigmoid()(x)
+        return x
+class SampleCNN(nn.Module):
+    '''
+    Lee et al. 2017
+    Sample-level deep convolutional neural networks for music auto-tagging using raw waveforms.
+    Sample-level CNN.
+    '''
+    def __init__(self,
+                 n_class=50):
+        super(SampleCNN, self).__init__()
+        self.layer1 = Conv_1d(1, 128, shape=3, stride=3, pooling=1)
+        self.layer2 = Conv_1d(128, 128, shape=3, stride=1, pooling=3)
+        self.layer3 = Conv_1d(128, 128, shape=3, stride=1, pooling=3)
+        self.layer4 = Conv_1d(128, 256, shape=3, stride=1, pooling=3)
+        self.layer5 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer6 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer7 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer8 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer9 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer10 = Conv_1d(256, 512, shape=3, stride=1, pooling=3)
+        self.layer11 = Conv_1d(512, 512, shape=1, stride=1, pooling=1)
+        self.dropout = nn.Dropout(0.5)
+        self.dense = nn.Linear(512, n_class)
+    def forward(self, x):
+        x = x.unsqueeze(1)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = self.layer8(x)
+        x = self.layer9(x)
+        x = self.layer10(x)
+        x = self.layer11(x)
+        x = x.squeeze(-1)
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = nn.Sigmoid()(x)
+        return x
+class SampleCNNSE(nn.Module):
+    '''
+    Kim et al. 2018
+    Sample-level CNN architectures for music auto-tagging using raw waveforms.
+    Sample-level CNN + residual connections + squeeze & excitation.
+    '''
+    def __init__(self,
+                 n_class=50):
+        super(SampleCNNSE, self).__init__()
+        self.layer1 = ResSE_1d(1, 128, shape=3, stride=3, pooling=1)
+        self.layer2 = ResSE_1d(128, 128, shape=3, stride=1, pooling=3)
+        self.layer3 = ResSE_1d(128, 128, shape=3, stride=1, pooling=3)
+        self.layer4 = ResSE_1d(128, 256, shape=3, stride=1, pooling=3)
+        self.layer5 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer6 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer7 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer8 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer9 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
+        self.layer10 = ResSE_1d(256, 512, shape=3, stride=1, pooling=3)
+        self.layer11 = ResSE_1d(512, 512, shape=1, stride=1, pooling=1)
+        self.dropout = nn.Dropout(0.5)
+        self.dense1 = nn.Linear(512, 512)
+        self.bn = nn.BatchNorm1d(512)
+        self.dense2 = nn.Linear(512, n_class)
+    def forward(self, x):
+        x = x.unsqueeze(1)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = self.layer8(x)
+        x = self.layer9(x)
+        x = self.layer10(x)
+        x = self.layer11(x)
+        x = x.squeeze(-1)
+        x = nn.ReLU()(self.bn(self.dense1(x)))
+        x = self.dropout(x)
+        x = self.dense2(x)
+        x = nn.Sigmoid()(x)
+        return x
+class ShortChunkCNN(nn.Module):
+    '''
+    Short-chunk CNN architecture.
+    So-called vgg-ish model with a small receptive field.
+    Deeper layers, smaller pooling (2x2).
+    '''
+    def __init__(self,
+                n_channels=128,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=128,
+                n_class=50):
+        super(ShortChunkCNN, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # CNN
+        self.layer1 = Conv_2d(1, n_channels, pooling=2)
+        self.layer2 = Conv_2d(n_channels, n_channels, pooling=2)
+        self.layer3 = Conv_2d(n_channels, n_channels*2, pooling=2)
+        self.layer4 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
+        self.layer5 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
+        self.layer6 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
+        self.layer7 = Conv_2d(n_channels*2, n_channels*4, pooling=2)
+        # Dense
+        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
+        self.bn = nn.BatchNorm1d(n_channels*4)
+        self.dense2 = nn.Linear(n_channels*4, n_class)
+        self.dropout = nn.Dropout(0.5)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # CNN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = x.squeeze(2)
+        # Global Max Pooling
+        if x.size(-1) != 1:
+            x = nn.MaxPool1d(x.size(-1))(x)
+        x = x.squeeze(2)
+        # Dense
+        x = self.dense1(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.dense2(x)
+        x = nn.Sigmoid()(x)
+        return x
+class ShortChunkCNN_Res(nn.Module):
+    '''
+    Short-chunk CNN architecture with residual connections.
+    '''
+    def __init__(self,
+                n_channels=128,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=128,
+                n_class=50):
+        super(ShortChunkCNN_Res, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # CNN
+        self.layer1 = Res_2d(1, n_channels, stride=2)
+        self.layer2 = Res_2d(n_channels, n_channels, stride=2)
+        self.layer3 = Res_2d(n_channels, n_channels*2, stride=2)
+        self.layer4 = Res_2d(n_channels*2, n_channels*2, stride=2)
+        self.layer5 = Res_2d(n_channels*2, n_channels*2, stride=2)
+        self.layer6 = Res_2d(n_channels*2, n_channels*2, stride=2)
+        self.layer7 = Res_2d(n_channels*2, n_channels*4, stride=2)
+        # Dense
+        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
+        self.bn = nn.BatchNorm1d(n_channels*4)
+        self.dense2 = nn.Linear(n_channels*4, n_class)
+        self.dropout = nn.Dropout(0.5)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # CNN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = x.squeeze(2)
+        # Global Max Pooling
+        if x.size(-1) != 1:
+            x = nn.MaxPool1d(x.size(-1))(x)
+        x = x.squeeze(2)
+        # Dense
+        x = self.dense1(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.dense2(x)
+        x = nn.Sigmoid()(x)
+        return x
+class CNNSA(nn.Module):
+    '''
+    Won et al. 2019
+    Toward interpretable music tagging with self-attention.
+    Feature extraction with CNN + temporal summary with Transformer encoder.
+    '''
+    def __init__(self,
+                n_channels=128,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=128,
+                n_class=50):
+        super(CNNSA, self).__init__()
+        # Spectrogram
+        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
+                                                         n_fft=n_fft,
+                                                         f_min=f_min,
+                                                         f_max=f_max,
+                                                         n_mels=n_mels)
+        self.to_db = torchaudio.transforms.AmplitudeToDB()
+        self.spec_bn = nn.BatchNorm2d(1)
+        # CNN
+        self.layer1 = Res_2d(1, n_channels, stride=2)
+        self.layer2 = Res_2d(n_channels, n_channels, stride=2)
+        self.layer3 = Res_2d(n_channels, n_channels*2, stride=2)
+        self.layer4 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
+        self.layer5 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
+        self.layer6 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
+        self.layer7 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
+        # Transformer encoder
+        bert_config = BertConfig(vocab_size=256,
+                                 hidden_size=256,
+                                 num_hidden_layers=2,
+                                 num_attention_heads=8,
+                                 intermediate_size=1024,
+                                 hidden_act="gelu",
+                                 hidden_dropout_prob=0.4,
+                                 max_position_embeddings=700,
+                                 attention_probs_dropout_prob=0.5)
+        self.encoder = BertEncoder(bert_config)
+        self.pooler = BertPooler(bert_config)
+        self.vec_cls = self.get_cls(256)
+        # Dense
+        self.dropout = nn.Dropout(0.5)
+        self.dense = nn.Linear(256, n_class)
+    def get_cls(self, channel):
+        np.random.seed(0)
+        single_cls = torch.Tensor(np.random.random((1, channel)))
+        vec_cls = torch.cat([single_cls for _ in range(64)], dim=0)
+        vec_cls = vec_cls.unsqueeze(1)
+        return vec_cls
+    def append_cls(self, x):
+        batch, _, _ = x.size()
+        part_vec_cls = self.vec_cls[:batch].clone()
+        part_vec_cls = part_vec_cls.to(x.device)
+        return torch.cat([part_vec_cls, x], dim=1)
+    def forward(self, x):
+        # Spectrogram
+        x = self.spec(x)
+        x = self.to_db(x)
+        x = x.unsqueeze(1)
+        x = self.spec_bn(x)
+        # CNN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = x.squeeze(2)
+        # Get [CLS] token
+        x = x.permute(0, 2, 1)
+        x = self.append_cls(x)
+        # Transformer encoder
+        x = self.encoder(x)
+        x = x[-1]
+        x = self.pooler(x)
+        # Dense
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = nn.Sigmoid()(x)
+        return x
+class HarmonicCNN(nn.Module):
+    '''
+    Won et al. 2020
+    Data-driven harmonic filters for audio representation learning.
+    Trainable harmonic band-pass filters, short-chunk CNN.
+    '''
+    def __init__(self,
+                n_channels=128,
+                sample_rate=16000,
+                n_fft=512,
+                f_min=0.0,
+                f_max=8000.0,
+                n_mels=128,
+                n_class=50,
+                n_harmonic=6,
+                semitone_scale=2,
+                learn_bw='only_Q'):
+        super(HarmonicCNN, self).__init__()
+        # Harmonic STFT
+        self.hstft = HarmonicSTFT(sample_rate=sample_rate,
+                                  n_fft=n_fft,
+                                  n_harmonic=n_harmonic,
+                                  semitone_scale=semitone_scale,
+                                  learn_bw=learn_bw)
+        self.hstft_bn = nn.BatchNorm2d(n_harmonic)
+        # CNN
+        self.layer1 = Conv_2d(n_harmonic, n_channels, pooling=2)
+        self.layer2 = Res_2d_mp(n_channels, n_channels, pooling=2)
+        self.layer3 = Res_2d_mp(n_channels, n_channels, pooling=2)
+        self.layer4 = Res_2d_mp(n_channels, n_channels, pooling=2)
+        self.layer5 = Conv_2d(n_channels, n_channels*2, pooling=2)
+        self.layer6 = Res_2d_mp(n_channels*2, n_channels*2, pooling=(2,3))
+        self.layer7 = Res_2d_mp(n_channels*2, n_channels*2, pooling=(2,3))
+        # Dense
+        self.dense1 = nn.Linear(n_channels*2, n_channels*2)
+        self.bn = nn.BatchNorm1d(n_channels*2)
+        self.dense2 = nn.Linear(n_channels*2, n_class)
+        self.dropout = nn.Dropout(0.5)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        # Spectrogram
+        x = self.hstft_bn(self.hstft(x))
+        # CNN
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+        x = x.squeeze(2)
+        # Global Max Pooling
+        if x.size(-1) != 1:
+            x = nn.MaxPool1d(x.size(-1))(x)
+        x = x.squeeze(2)
+        # Dense
+        x = self.dense1(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.dense2(x)
+        x = nn.Sigmoid()(x)
+        return x

models/modules.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+import torchaudio
+import sys
+from torch.autograd import Variable
+import math
+import librosa
+class Conv_1d(nn.Module):
+    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
+        super(Conv_1d, self).__init__()
+        self.conv = nn.Conv1d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+        self.bn = nn.BatchNorm1d(output_channels)
+        self.relu = nn.ReLU()
+        self.mp = nn.MaxPool1d(pooling)
+    def forward(self, x):
+        out = self.mp(self.relu(self.bn(self.conv(x))))
+        return out
+class Conv_2d(nn.Module):
+    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
+        super(Conv_2d, self).__init__()
+        self.conv = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+        self.bn = nn.BatchNorm2d(output_channels)
+        self.relu = nn.ReLU()
+        self.mp = nn.MaxPool2d(pooling)
+    def forward(self, x):
+        out = self.mp(self.relu(self.bn(self.conv(x))))
+        return out
+class Res_2d(nn.Module):
+    def __init__(self, input_channels, output_channels, shape=3, stride=2):
+        super(Res_2d, self).__init__()
+        # convolution
+        self.conv_1 = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+        self.bn_1 = nn.BatchNorm2d(output_channels)
+        self.conv_2 = nn.Conv2d(output_channels, output_channels, shape, padding=shape//2)
+        self.bn_2 = nn.BatchNorm2d(output_channels)
+        # residual
+        self.diff = False
+        if (stride != 1) or (input_channels != output_channels):
+            self.conv_3 = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+            self.bn_3 = nn.BatchNorm2d(output_channels)
+            self.diff = True
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        # convolution
+        out = self.bn_2(self.conv_2(self.relu(self.bn_1(self.conv_1(x)))))
+        # residual
+        if self.diff:
+            x = self.bn_3(self.conv_3(x))
+        out = x + out
+        out = self.relu(out)
+        return out
+class Res_2d_mp(nn.Module):
+    def __init__(self, input_channels, output_channels, pooling=2):
+        super(Res_2d_mp, self).__init__()
+        self.conv_1 = nn.Conv2d(input_channels, output_channels, 3, padding=1)
+        self.bn_1 = nn.BatchNorm2d(output_channels)
+        self.conv_2 = nn.Conv2d(output_channels, output_channels, 3, padding=1)
+        self.bn_2 = nn.BatchNorm2d(output_channels)
+        self.relu = nn.ReLU()
+        self.mp = nn.MaxPool2d(pooling)
+    def forward(self, x):
+        out = self.bn_2(self.conv_2(self.relu(self.bn_1(self.conv_1(x)))))
+        out = x + out
+        out = self.mp(self.relu(out))
+        return out
+class ResSE_1d(nn.Module):
+    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=3):
+        super(ResSE_1d, self).__init__()
+        # convolution
+        self.conv_1 = nn.Conv1d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+        self.bn_1 = nn.BatchNorm1d(output_channels)
+        self.conv_2 = nn.Conv1d(output_channels, output_channels, shape, padding=shape//2)
+        self.bn_2 = nn.BatchNorm1d(output_channels)
+        # squeeze & excitation
+        self.dense1 = nn.Linear(output_channels, output_channels)
+        self.dense2 = nn.Linear(output_channels, output_channels)
+        # residual
+        self.diff = False
+        if (stride != 1) or (input_channels != output_channels):
+            self.conv_3 = nn.Conv1d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
+            self.bn_3 = nn.BatchNorm1d(output_channels)
+            self.diff = True
+        self.relu = nn.ReLU()
+        self.sigmoid = nn.Sigmoid()
+        self.mp = nn.MaxPool1d(pooling)
+    def forward(self, x):
+        # convolution
+        out = self.bn_2(self.conv_2(self.relu(self.bn_1(self.conv_1(x)))))
+        # squeeze & excitation
+        se_out = nn.AvgPool1d(out.size(-1))(out)
+        se_out = se_out.squeeze(-1)
+        se_out = self.relu(self.dense1(se_out))
+        se_out = self.sigmoid(self.dense2(se_out))
+        se_out = se_out.unsqueeze(-1)
+        out = torch.mul(out, se_out)
+        # residual
+        if self.diff:
+            x = self.bn_3(self.conv_3(x))
+        out = x + out
+        out = self.mp(self.relu(out))
+        return out
+class Conv_V(nn.Module):
+    # vertical convolution
+    def __init__(self, input_channels, output_channels, filter_shape):
+        super(Conv_V, self).__init__()
+        self.conv = nn.Conv2d(input_channels, output_channels, filter_shape,
+                              padding=(0, filter_shape[1]//2))
+        self.bn = nn.BatchNorm2d(output_channels)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        x = self.relu(self.bn(self.conv(x)))
+        freq = x.size(2)
+        out = nn.MaxPool2d((freq, 1), stride=(freq, 1))(x)
+        out = out.squeeze(2)
+        return out
+class Conv_H(nn.Module):
+    # horizontal convolution
+    def __init__(self, input_channels, output_channels, filter_length):
+        super(Conv_H, self).__init__()
+        self.conv = nn.Conv1d(input_channels, output_channels, filter_length,
+                              padding=filter_length//2)
+        self.bn = nn.BatchNorm1d(output_channels)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        freq = x.size(2)
+        out = nn.AvgPool2d((freq, 1), stride=(freq, 1))(x)
+        out = out.squeeze(2)
+        out = self.relu(self.bn(self.conv(out)))
+        return out
+# Modules for harmonic filters
+def hz_to_midi(hz):
+    return 12 * (torch.log2(hz) - np.log2(440.0)) + 69
+def midi_to_hz(midi):
+    return 440.0 * (2.0 ** ((midi - 69.0)/12.0))
+def note_to_midi(note):
+    return librosa.core.note_to_midi(note)
+def hz_to_note(hz):
+    return librosa.core.hz_to_note(hz)
+def initialize_filterbank(sample_rate, n_harmonic, semitone_scale):
+    # MIDI
+    # lowest note
+    low_midi = note_to_midi('C1')
+    # highest note
+    high_note = hz_to_note(sample_rate / (2 * n_harmonic))
+    high_midi = note_to_midi(high_note)
+    # number of scales
+    level = (high_midi - low_midi) * semitone_scale
+    midi = np.linspace(low_midi, high_midi, level + 1)
+    hz = midi_to_hz(midi[:-1])
+    # stack harmonics
+    harmonic_hz = []
+    for i in range(n_harmonic):
+        harmonic_hz = np.concatenate((harmonic_hz, hz * (i+1)))
+    return harmonic_hz, level
+class HarmonicSTFT(nn.Module):
+    def __init__(self,
+                 sample_rate=16000,
+                 n_fft=513,
+                 win_length=None,
+                 hop_length=None,
+                 pad=0,
+                 power=2,
+                 normalized=False,
+                 n_harmonic=6,
+                 semitone_scale=2,
+                 bw_Q=1.0,
+                 learn_bw=None):
+        super(HarmonicSTFT, self).__init__()
+        # Parameters
+        self.sample_rate = sample_rate
+        self.n_harmonic = n_harmonic
+        self.bw_alpha = 0.1079
+        self.bw_beta = 24.7
+        # Spectrogram
+        self.spec = torchaudio.transforms.Spectrogram(n_fft=n_fft, win_length=win_length,
+                                                      hop_length=None, pad=0,
+                                                      window_fn=torch.hann_window,
+                                                      power=power, normalized=normalized, wkwargs=None)
+        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
+        # Initialize the filterbank. Equally spaced in MIDI scale.
+        harmonic_hz, self.level = initialize_filterbank(sample_rate, n_harmonic, semitone_scale)
+        # Center frequncies to tensor
+        self.f0 = torch.tensor(harmonic_hz.astype('float32'))
+        # Bandwidth parameters
+        if learn_bw == 'only_Q':
+            self.bw_Q = nn.Parameter(torch.tensor(np.array([bw_Q]).astype('float32')))
+        elif learn_bw == 'fix':
+            self.bw_Q = torch.tensor(np.array([bw_Q]).astype('float32'))
+    def get_harmonic_fb(self):
+        # bandwidth
+        bw = (self.bw_alpha * self.f0 + self.bw_beta) / self.bw_Q
+        bw = bw.unsqueeze(0) # (1, n_band)
+        f0 = self.f0.unsqueeze(0) # (1, n_band)
+        fft_bins = self.fft_bins.unsqueeze(1) # (n_bins, 1)
+        up_slope = torch.matmul(fft_bins, (2/bw)) + 1 - (2 * f0 / bw)
+        down_slope = torch.matmul(fft_bins, (-2/bw)) + 1 + (2 * f0 / bw)
+        fb = torch.max(self.zero, torch.min(down_slope, up_slope))
+        return fb
+    def to_device(self, device, n_bins):
+        self.f0 = self.f0.to(device)
+        self.bw_Q = self.bw_Q.to(device)
+        # fft bins
+        self.fft_bins = torch.linspace(0, self.sample_rate//2, n_bins)
+        self.fft_bins = self.fft_bins.to(device)
+        self.zero = torch.zeros(1)
+        self.zero = self.zero.to(device)
+    def forward(self, waveform):
+        # stft
+        spectrogram = self.spec(waveform)
+        # to device
+        self.to_device(waveform.device, spectrogram.size(1))
+        # triangle filter
+        harmonic_fb = self.get_harmonic_fb()
+        harmonic_spec = torch.matmul(spectrogram.transpose(1, 2), harmonic_fb).transpose(1, 2)
+        # (batch, channel, length) -> (batch, harmonic, f0, length)
+        b, c, l = harmonic_spec.size()
+        harmonic_spec = harmonic_spec.view(b, self.n_harmonic, self.level, l)
+        # amplitude to db
+        harmonic_spec = self.amplitude_to_db(harmonic_spec)
+        return harmonic_spec

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch==1.12.0
+torchvision==0.13.0
+torchaudio==0.12.0
+gradio==3.1.4
+librosa==0.9.2

samples/flute.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2aaa6c5640106826a4db1d7932f9edc3b0fbb0c68cbd4e7d7d544d2fdc28af17
+size 3528044

samples/guitar_acoustic.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:450adb05b9b91dcc03b1262407b20c801769ccdca841e0f7860e5e3fe1a0a652
+size 4301040

samples/guitar_electric.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60f854cc407877512a3e68a286cfd26e95dc2f0a4e76ba313fbb3e21ddf2d2f9
+size 3492764

samples/piano.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01ba9d83ec1404ccad78a6310baba7d51583e42c20a07b7304e215a7edfe2d5e
+size 4300764

samples/violin.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:690365b52ee8ca9f7b0147247270e375d70be31512c3ae591e52bf55605d3ece
+size 19105034