IlayMalinyak committed · Commit b3fb4dd · 1 Parent(s): 192ac3b
first commit
Browse files
- tasks/Modules/ResNet18.py +69 -0
- tasks/Modules/__init__.py +0 -0
- tasks/Modules/cnn.py +58 -0
- tasks/Modules/conformer.py +584 -0
- tasks/Modules/mhsa_pro.py +231 -0
- tasks/audio.py +37 -6
- tasks/config.yaml +66 -0
- tasks/data.py +43 -0
- tasks/data_utils.py +63 -0
- tasks/models.py +114 -0
- tasks/train.py +293 -0
tasks/Modules/ResNet18.py
ADDED
@@ -0,0 +1,69 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# https://github.com/samcw/ResNet18-Pytorch
class ResBlock(nn.Module):
    def __init__(self, inchannel, outchannel, stride=1):
        super(ResBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv1d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm1d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv1d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm1d(outchannel)
        )
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            self.shortcut = nn.Sequential(
                nn.Conv1d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(outchannel)
            )

    def forward(self, x):
        out = self.left(x)
        out = out + self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet18(nn.Module):
    def __init__(self, args):
        super(ResNet18, self).__init__()
        self.inchannel = 64
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU()
        )
        self.layer1 = self.make_layer(ResBlock, 64, 2, stride=1)
        self.layer2 = self.make_layer(ResBlock, 128, 2, stride=2)
        self.layer3 = self.make_layer(ResBlock, 256, 2, stride=2)
        self.layer4 = self.make_layer(ResBlock, 512, 2, stride=2)
        self.pred_layer = nn.Sequential(
            nn.Linear(512, 512),
            nn.SiLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, 1),
        )
        if getattr(args, 'mean_label', False):
            self.pred_layer[3].bias.data.fill_(args.mean_label)

    def make_layer(self, block, channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.inchannel, channels, stride))
            self.inchannel = channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.unsqueeze(1)
        out = self.conv1(x)
        out = F.max_pool1d(out, 3, 2, 1)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = out.mean(-1)
        out = self.pred_layer(out)
        return out
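For orientation, a minimal sketch of exercising this 1-D ResNet18 on a dummy signal; the `args` namespace and the input length are illustrative stand-ins for the project's config (`mean_label` is optional, per the `getattr` guard above):

import torch
from types import SimpleNamespace
from tasks.Modules.ResNet18 import ResNet18  # assumes the package layout of this commit

# Hypothetical args object; ResNet18 only reads the optional `mean_label`.
args = SimpleNamespace(mean_label=0.5)
model = ResNet18(args)

x = torch.randn(4, 4096)   # (batch, time) raw 1-D signal
out = model(x)             # unsqueezed to (batch, 1, time) internally
print(out.shape)           # torch.Size([4, 1]) - one logit per sample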
tasks/Modules/__init__.py
ADDED
File without changes
tasks/Modules/cnn.py
ADDED
@@ -0,0 +1,58 @@
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=args.encoder_dim,
                      out_channels=args.encoder_dim,
                      kernel_size=args.kernel_size,
                      stride=1, padding='same', bias=False),
            nn.BatchNorm1d(num_features=args.encoder_dim),
            nn.SiLU(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.transpose(1, 2)
        return self.layers(x).transpose(1, 2)

class ConvBlockDecoder(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=args.decoder_dim,
                      out_channels=args.decoder_dim,
                      kernel_size=args.kernel_size,
                      stride=1, padding='same', bias=False),
            nn.BatchNorm1d(num_features=args.decoder_dim),
            nn.SiLU(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.transpose(1, 2)
        return self.layers(x).transpose(1, 2)

class ResNetLayer(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        self.conv_layer = nn.Sequential(
            nn.Conv1d(in_channels=args.encoder_dim,
                      out_channels=args.encoder_dim,
                      kernel_size=3,
                      stride=1, padding='same', bias=False),
            nn.BatchNorm1d(num_features=args.encoder_dim),
            nn.SiLU(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.conv_layer(x) + x


class ResNetBlock(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        self.layers = nn.Sequential(*[ResNetLayer(args) for _ in range(3)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)
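These blocks accept `(batch, time, dim)` tensors, transpose to the `(batch, dim, time)` layout that `nn.Conv1d` expects, and transpose back, so they can be dropped between attention layers. A small shape-check sketch with an illustrative args namespace:

import torch
from types import SimpleNamespace
from tasks.Modules.cnn import ConvBlock  # assumes the package layout of this commit

args = SimpleNamespace(encoder_dim=128, kernel_size=3)  # hypothetical config values
block = ConvBlock(args)

x = torch.randn(2, 100, 128)   # (batch, time, encoder_dim)
y = block(x)                   # Conv1d runs on (batch, encoder_dim, time) internally
print(y.shape)                 # torch.Size([2, 100, 128]) - length preserved by padding='same'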
tasks/Modules/conformer.py
ADDED
@@ -0,0 +1,584 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.nn.init as init
import math

from .mhsa_pro import MHA_rotary, MHA_decoder
from .cnn import ConvBlock, ConvBlockDecoder

from typing import Optional, Tuple

class ResidualConnectionModule(nn.Module):
    """
    Residual Connection Module.
    outputs = (module(inputs) x module_factor + inputs x input_factor)
    """
    def __init__(self, module: nn.Module, dims, args):
        super(ResidualConnectionModule, self).__init__()
        self.module = module
        self.module_factor = 1
        self.input_factor = 1

    def forward(self, inputs: Tensor, **kwargs) -> Tensor:
        return (self.module(inputs, **kwargs) * self.module_factor) + (inputs * self.input_factor)

class PostNorm(nn.Module):
    """
    Post-norm residual connection.
    outputs = LayerNorm(module(inputs) + inputs x input_factor)
    """
    def __init__(self, module: nn.Module, dims, args):
        super(PostNorm, self).__init__()
        self.module = module
        input_factor = torch.FloatTensor(args.alpha) if getattr(args, 'alpha', None) else torch.tensor(1.)
        self.register_buffer('input_factor', input_factor)
        self.norm = nn.LayerNorm(dims)

    def forward(self, inputs: Tensor, **kwargs) -> Tensor:
        return self.norm(self.module(inputs, **kwargs) + (inputs * self.input_factor))

class Linear(nn.Module):
    """
    Wrapper class of torch.nn.Linear.
    Weights are initialized with Xavier initialization and biases to zeros.
    """
    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super(Linear, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        init.xavier_uniform_(self.linear.weight)
        if bias:
            init.zeros_(self.linear.bias)

    def forward(self, x: Tensor) -> Tensor:
        return self.linear(x)


class View(nn.Module):
    """ Wrapper class of torch.view() for Sequential module. """
    def __init__(self, shape: tuple, contiguous: bool = False):
        super(View, self).__init__()
        self.shape = shape
        self.contiguous = contiguous

    def forward(self, x: Tensor) -> Tensor:
        if self.contiguous:
            x = x.contiguous()
        return x.view(*self.shape)


class Transpose(nn.Module):
    """ Wrapper class of torch.transpose() for Sequential module. """
    def __init__(self, shape: tuple):
        super(Transpose, self).__init__()
        self.shape = shape

    def forward(self, x: Tensor) -> Tensor:
        return x.transpose(*self.shape)

class FeedForwardModule(nn.Module):
    """
    The Conformer feed-forward module follows pre-norm residual units and applies layer normalization
    within the residual unit and on the input before the first linear layer. It also applies Swish
    activation and dropout, which help regularize the network.
    Args:
        encoder_dim (int): Dimension of conformer encoder
        expansion_factor (int): Expansion factor of feed forward module.
        dropout_p (float): Ratio of dropout
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing input sequences
    Outputs: outputs
        - **outputs** (batch, time, dim): Tensor produced by the feed forward module.
    """
    def __init__(
            self,
            args,
    ) -> None:
        super(FeedForwardModule, self).__init__()
        expansion_factor = 4
        self.sequential = nn.Sequential(
            nn.LayerNorm(args.encoder_dim),
            Linear(args.encoder_dim, args.encoder_dim * expansion_factor, bias=True),
            nn.SiLU(),
            nn.Dropout(p=args.dropout_p),
            Linear(args.encoder_dim * expansion_factor, args.encoder_dim, bias=True),
            nn.Dropout(p=args.dropout_p),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.sequential(inputs)

class DepthwiseConv1d(nn.Module):
    """
    When groups == in_channels and out_channels == K * in_channels, where K is a positive integer,
    this operation is termed in the literature as depthwise convolution.
    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by depthwise 1-D convolution.
    """
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: int,
            stride: int = 1,
            padding: int = 0,
            bias: bool = False,
    ) -> None:
        super(DepthwiseConv1d, self).__init__()
        assert out_channels % in_channels == 0, "out_channels should be constant multiple of in_channels"
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=in_channels,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)


class PointwiseConv1d(nn.Module):
    """
    When kernel size == 1, this conv1d operation is termed in the literature as pointwise convolution.
    It is often used to match dimensions.
    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by pointwise 1-D convolution.
    """
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            stride: int = 1,
            padding: int = 0,
            bias: bool = True,
    ) -> None:
        super(PointwiseConv1d, self).__init__()
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)


class ConformerConvModule(nn.Module):
    """
    The Conformer convolution module starts with a pointwise convolution and a gated linear unit (GLU).
    This is followed by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the
    convolution to aid training deep models.
    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 31
        dropout_p (float, optional): probability of dropout
    Inputs: inputs
        inputs (batch, time, dim): Tensor containing input sequences
    Outputs: outputs
        outputs (batch, time, dim): Tensor produced by the conformer convolution module.
    """
    def __init__(
            self,
            args,
    ) -> None:
        super(ConformerConvModule, self).__init__()
        assert (args.kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        expansion_factor = 2
        dropout_p = 0.1

        self.sequential = nn.Sequential(
            nn.LayerNorm(args.encoder_dim),
            Transpose(shape=(1, 2)),
            PointwiseConv1d(args.encoder_dim, args.encoder_dim * expansion_factor, stride=1, padding=0, bias=True),
            nn.GLU(dim=1),
            DepthwiseConv1d(args.encoder_dim, args.encoder_dim, args.kernel_size, stride=1, padding=(args.kernel_size - 1) // 2),
            nn.BatchNorm1d(args.encoder_dim),
            nn.SiLU(),
            PointwiseConv1d(args.encoder_dim, args.encoder_dim, stride=1, padding=0, bias=True),
            nn.Dropout(p=dropout_p),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.sequential(inputs).transpose(1, 2)

class PositionalEncoding(nn.Module):
    """
    Positional Encoding proposed in "Attention Is All You Need".
    Since the transformer contains no recurrence and no convolution, we must add some positional
    information for the model to make use of the order of the sequence.
    "Attention Is All You Need" uses sine and cosine functions of different frequencies:
        PE_(pos, 2i)   = sin(pos / power(10000, 2i / d_model))
        PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model))
    """
    def __init__(self, d_model: int = 128, max_len: int = 10000) -> None:
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model, requires_grad=False)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, length: int) -> Tensor:
        return self.pe[:, :length]

class RelativeMultiHeadAttention(nn.Module):
    """
    Multi-head attention with relative positional encoding.
    This concept was proposed in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context".
    Args:
        d_model (int): The dimension of model
        num_heads (int): The number of attention heads.
        dropout_p (float): probability of dropout
    Inputs: query, key, value, pos_embedding, mask
        - **query** (batch, time, dim): Tensor containing query vector
        - **key** (batch, time, dim): Tensor containing key vector
        - **value** (batch, time, dim): Tensor containing value vector
        - **pos_embedding** (batch, time, dim): Positional embedding tensor
        - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
    Returns:
        - **outputs**: Tensor produced by the relative multi-head attention module.
    """
    def __init__(
            self,
            encoder_dim: int = 128,
            num_heads: int = 8,
            dropout_p: float = 0.1,
    ):
        super(RelativeMultiHeadAttention, self).__init__()
        assert encoder_dim % num_heads == 0, "d_model % num_heads should be zero."
        self.d_model = encoder_dim
        self.d_head = int(encoder_dim / num_heads)
        self.num_heads = num_heads
        self.sqrt_dim = math.sqrt(encoder_dim)

        self.query_proj = Linear(encoder_dim, encoder_dim)
        self.key_proj = Linear(encoder_dim, encoder_dim)
        self.value_proj = Linear(encoder_dim, encoder_dim)
        self.pos_proj = Linear(encoder_dim, encoder_dim, bias=False)

        self.dropout = nn.Dropout(p=dropout_p)
        self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
        self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
        torch.nn.init.xavier_uniform_(self.u_bias)
        torch.nn.init.xavier_uniform_(self.v_bias)

        self.out_proj = Linear(encoder_dim, encoder_dim)

    def forward(
            self,
            query: Tensor,
            key: Tensor,
            value: Tensor,
            pos_embedding: Tensor,
            mask: Optional[Tensor] = None,
    ) -> Tensor:
        batch_size = value.size(0)

        query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
        key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
        value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
        pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)

        content_score = torch.matmul((query + self.u_bias).transpose(1, 2), key.transpose(2, 3))
        pos_score = torch.matmul((query + self.v_bias).transpose(1, 2), pos_embedding.permute(0, 2, 3, 1))
        # content_score = torch.matmul((query).transpose(1, 2), key.transpose(2, 3))
        # pos_score = torch.matmul((query).transpose(1, 2), pos_embedding.permute(0, 2, 3, 1))
        # Q (B, num_heads, length, d_head) x PE (B, num_heads, d_head, length) -> pos_score (B, num_heads, length, length)
        pos_score = self._relative_shift(pos_score)
        score = (content_score + pos_score) / self.sqrt_dim

        if mask is not None:
            mask = mask.unsqueeze(1)
            score.masked_fill_(mask, -1e9)

        score = F.softmax(score, -1)
        attn = self.dropout(score)

        context = torch.matmul(attn, value).transpose(1, 2)
        context = context.contiguous().view(batch_size, -1, self.d_model)

        return self.out_proj(context)

    def _relative_shift(self, pos_score: Tensor) -> Tensor:
        batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
        zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1)
        padded_pos_score = torch.cat([zeros, pos_score], dim=-1)

        padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
        pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
        # shift the position scores one unit along the length axis, leaving a blank row.
        return pos_score


class MultiHeadedSelfAttentionModule(nn.Module):
    """
    Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from
    Transformer-XL, the relative sinusoidal positional encoding scheme. The relative positional encoding
    allows the self-attention module to generalize better to different input lengths, and the resulting
    encoder is more robust to variance in utterance length. Conformer uses pre-norm residual units with
    dropout, which helps train and regularize deeper models.
    Args:
        d_model (int): The dimension of model
        num_heads (int): The number of attention heads.
        dropout_p (float): probability of dropout
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs, mask
        - **inputs** (batch, time, dim): Tensor containing input vector
        - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
    Returns:
        - **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
    """
    def __init__(self, args):
        super(MultiHeadedSelfAttentionModule, self).__init__()
        dropout_p = 0.1
        self.positional_encoding = PositionalEncoding(args.encoder_dim)
        self.layer_norm = nn.LayerNorm(args.encoder_dim)
        self.attention = RelativeMultiHeadAttention(args.encoder_dim, args.num_heads, args.dropout_p)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, inputs: Tensor, mask: Optional[Tensor] = None):
        batch_size, seq_length, _ = inputs.size()
        pos_embedding = self.positional_encoding(seq_length)
        pos_embedding = pos_embedding.repeat(batch_size, 1, 1)

        inputs = self.layer_norm(inputs)
        outputs = self.attention(inputs, inputs, inputs, pos_embedding=pos_embedding, mask=mask)

        return self.dropout(outputs)

class ConformerBlock(nn.Module):
    """
    A Conformer block contains two feed-forward modules sandwiching the multi-headed self-attention
    module and the convolution module. This sandwich structure is inspired by Macaron-Net, which proposes
    replacing the original feed-forward layer in the Transformer block with two half-step feed-forward
    layers, one before the attention layer and one after.
    Args:
        encoder_dim (int, optional): Dimension of conformer encoder
        num_attention_heads (int, optional): Number of attention heads
        feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
        conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
        feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
        attention_dropout_p (float, optional): Probability of attention module dropout
        conv_dropout_p (float, optional): Probability of conformer convolution module dropout
        conv_kernel_size (int or tuple, optional): Size of the convolving kernel
        half_step_residual (bool): Flag indicating whether to use half-step residual or not
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, time, dim): Tensor produced by the conformer block.
    """
    def __init__(
            self,
            args
    ):
        super(ConformerBlock, self).__init__()

        norm_dict = {
            'shortcut': ResidualConnectionModule,
            'postnorm': PostNorm
        }
        block_dict = {
            'ffn': FeedForwardModule,
            'mhsa': MultiHeadedSelfAttentionModule,
            'mhsa_pro': MHA_rotary,
            'conv': ConvBlock,
            'conformerconv': ConformerConvModule
        }

        self.modlist = nn.ModuleList([norm_dict[args.norm](block_dict[block](args), args.encoder_dim, args)
                                      for block in args.encoder])

    def forward(self, x: Tensor, RoPE, key_padding_mask=None) -> Tensor:
        for m in self.modlist:
            if isinstance(m.module, MHA_rotary):
                x = m(x, RoPE=RoPE, key_padding_mask=key_padding_mask)
            else:
                x = m(x)
        return x


class DecoderBlock(nn.Module):
    """
    A decoder block contains two feed-forward modules sandwiching the multi-headed self-attention module
    and the convolution module, following the same Macaron-Net-inspired sandwich structure as the
    conformer block.
    Args:
        encoder_dim (int, optional): Dimension of conformer encoder
        num_attention_heads (int, optional): Number of attention heads
        feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
        conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
        feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
        attention_dropout_p (float, optional): Probability of attention module dropout
        conv_dropout_p (float, optional): Probability of conformer convolution module dropout
        conv_kernel_size (int or tuple, optional): Size of the convolving kernel
        half_step_residual (bool): Flag indicating whether to use half-step residual or not
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, time, dim): Tensor produced by the decoder block.
    """
    def __init__(
            self,
            args
    ):
        super(DecoderBlock, self).__init__()

        norm_dict = {
            'shortcut': ResidualConnectionModule,
            'postnorm': PostNorm
        }
        block_dict = {
            'ffn': FeedForwardModule,
            'mhsa': MultiHeadedSelfAttentionModule,
            'mhsa_pro': MHA_rotary,
            'mhsa_decoder': MHA_decoder,
            'conv': ConvBlockDecoder,
            'conformerconv': ConformerConvModule
        }

        self.modlist = nn.ModuleList([norm_dict[args.norm](block_dict[block](args), args.decoder_dim, args)
                                      for block in args.decoder])

    def forward(self, x: Tensor, memory: Tensor, RoPE, key_padding_mask=None) -> Tensor:
        for m in self.modlist:
            if isinstance(m.module, MHA_decoder):
                x = m(x, memory=memory, RoPE=RoPE, key_padding_mask=key_padding_mask)
            elif isinstance(m.module, MHA_rotary):
                x = m(x, RoPE=RoPE, key_padding_mask=key_padding_mask).transpose(0, 1)
            else:
                x = m(x)
        return x


class ConformerEncoder(nn.Module):
    """
    The Conformer encoder processes the input with a stack of conformer blocks.
    Args:
        input_dim (int, optional): Dimension of input vector
        encoder_dim (int, optional): Dimension of conformer encoder
        num_layers (int, optional): Number of conformer blocks
        num_attention_heads (int, optional): Number of attention heads
        feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
        conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
        feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
        attention_dropout_p (float, optional): Probability of attention module dropout
        conv_dropout_p (float, optional): Probability of conformer convolution module dropout
        conv_kernel_size (int or tuple, optional): Size of the convolving kernel
        half_step_residual (bool): Flag indicating whether to use half-step residual or not
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, time, dim): Tensor produced by the conformer encoder.
    """
    def __init__(
            self,
            args,
    ):
        super(ConformerEncoder, self).__init__()
        self.blocks = nn.ModuleList([ConformerBlock(args) for _ in range(args.num_layers)])

    def forward(self, x: Tensor, RoPE=None, key_padding_mask=None) -> Tensor:
        """
        Forward propagate `inputs` for encoder training.
        Args:
            inputs (torch.FloatTensor): An input sequence passed to the encoder. Typically a padded
                `FloatTensor` of size ``(batch, seq_length, dimension)``.
        Returns:
            * outputs (torch.FloatTensor): The output sequence of the encoder, a `FloatTensor` of size
              ``(batch, seq_length, dimension)``
        """
        for block in self.blocks:
            x = block(x, RoPE=RoPE, key_padding_mask=key_padding_mask)

        return x

class ConformerDecoder(nn.Module):
    """
    The Conformer decoder processes the input with a stack of decoder blocks, cross-attending to the
    encoder memory.
    Args:
        input_dim (int, optional): Dimension of input vector
        encoder_dim (int, optional): Dimension of conformer encoder
        num_layers (int, optional): Number of conformer blocks
        num_attention_heads (int, optional): Number of attention heads
        feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
        conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
        feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
        attention_dropout_p (float, optional): Probability of attention module dropout
        conv_dropout_p (float, optional): Probability of conformer convolution module dropout
        conv_kernel_size (int or tuple, optional): Size of the convolving kernel
        half_step_residual (bool): Flag indicating whether to use half-step residual or not
        device (torch.device): torch device (cuda or cpu)
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing input vector
    Returns: outputs
        - **outputs** (batch, time, dim): Tensor produced by the conformer decoder.
    """
    def __init__(
            self,
            args,
    ):
        super(ConformerDecoder, self).__init__()
        self.blocks = nn.ModuleList([DecoderBlock(args) for _ in range(args.num_decoder_layers)])

    def forward(self, x: Tensor, memory: Tensor, RoPE=None, key_padding_mask=None) -> Tensor:
        """
        Forward propagate `inputs` for decoder training.
        Args:
            inputs (torch.FloatTensor): An input sequence passed to the decoder. Typically a padded
                `FloatTensor` of size ``(batch, seq_length, dimension)``.
        Returns:
            * outputs (torch.FloatTensor): The output sequence of the decoder, a `FloatTensor` of size
              ``(batch, seq_length, dimension)``
        """
        for block in self.blocks:
            x = block(x, memory, RoPE=RoPE, key_padding_mask=key_padding_mask)

        return x
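Block composition here is config-driven: `args.encoder` names the sub-modules stacked inside each `ConformerBlock` and `args.norm` selects the residual wrapper. A minimal sketch of assembling an attention-free encoder (so no RoPE tensor is needed); the values are illustrative, not this commit's defaults:

import torch
from types import SimpleNamespace
from tasks.Modules.conformer import ConformerEncoder  # assumes the package layout of this commit

# Illustrative config; mirrors the keys read by the classes above.
args = SimpleNamespace(
    norm='postnorm',                    # wrap each sub-module in PostNorm
    encoder=['ffn', 'conformerconv'],   # sub-modules per ConformerBlock
    num_layers=2,
    encoder_dim=128,
    num_heads=8,
    kernel_size=3,
    dropout_p=0.1,
)
encoder = ConformerEncoder(args)

x = torch.randn(2, 100, 128)            # (batch, time, encoder_dim)
out = encoder(x)                        # RoPE defaults to None; fine without 'mhsa_pro' blocks
print(out.shape)                        # torch.Size([2, 100, 128])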
tasks/Modules/mhsa_pro.py
ADDED
@@ -0,0 +1,231 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

from typing import Optional, Tuple
import math
import logging

logger = logging.getLogger(__name__)


rwkv_emb_scale = 0.4  # try 0.4 for char-level english. try 1.0 for chinese.
rwkv_layer_decay = 1.0  # decay weights in higher layers. try 0.5 ~ 1.0.

class AttentionConfig:
    def __init__(self, ctx_len=100, **kwargs):
        self.ctx_len = ctx_len
        for k, v in kwargs.items():
            setattr(self, k, v)


########################################################################################################
# MHA_rotary: Multi-head Attention + Rotary Encoding + GeGLU FFN
########################################################################################################

class RotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, base=10000):
        super().__init__()
        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_len=None):
        if seq_len != self.seq_len_cached:
            self.seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device)
            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.cos_cached = emb.cos()
            self.sin_cached = emb.sin()
        return torch.stack([self.cos_cached, self.sin_cached])

class ContinuousRotaryEmbedding(torch.nn.Module):
    '''Continuous rotary position embedding'''
    def __init__(self, dim, sequence_scale):
        super().__init__()
        base = 10000
        self.sequence_scale = sequence_scale
        self.register_buffer('inv_freq', 1. / (base ** (torch.arange(0, dim, 2))))

    def forward(self, t):
        t = (t + 0.5) * self.sequence_scale
        freqs = torch.einsum('ij,k->ijk', t, self.inv_freq)   # freqs: [B, L, dim//2]
        emb = torch.cat((freqs, freqs), dim=-1).unsqueeze(1)  # emb: [B, 1, L, dim], 1 for broadcast in head_num dim
        return torch.stack([emb.cos(), emb.sin()])

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), -1)

@torch.jit.script
def apply_rotary_pos_emb(q, k, cos, sin):
    cos, sin = cos[..., :q.shape[2], :], sin[..., :q.shape[2], :]
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

class MHA_rotary(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.collect_attention_map = False
        self.attention_map = None
        assert args.encoder_dim % args.num_heads == 0
        self.num_heads = args.num_heads
        self.head_size = args.encoder_dim // args.num_heads

        if args.timeshift:
            self.time_shift = nn.ZeroPad2d((0, 0, 1, 0))

        self.query = nn.Linear(args.encoder_dim, args.encoder_dim)
        self.key = nn.Linear(args.encoder_dim, args.encoder_dim)
        self.value = nn.Linear(args.encoder_dim, args.encoder_dim)

        # self.register_buffer("mask", torch.tril(torch.ones(config.ctx_len, config.ctx_len)))

        self.rotary_ndims = int(self.head_size * 0.5)

        self.rotary_emb = RotaryEmbedding(self.rotary_ndims)

        self.output = nn.Linear(args.encoder_dim, args.encoder_dim)

    def forward(self, x, RoPE, key_padding_mask=None):
        B, T, C = x.size()

        if hasattr(self, 'time_shift'):
            x = torch.cat([self.time_shift(x)[:, :-1, :C//2], x[:, :, C//2:]], dim=-1)

        q = self.query(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)  # (B, T, C) -> (B, nh, T, hs)
        k = self.key(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)    # (B, T, C) -> (B, nh, T, hs)
        v = self.value(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)  # (B, T, C) -> (B, nh, T, hs)

        q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:]
        k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:]

        # cos, sin = self.rotary_emb(q, seq_len=T)
        cos, sin = RoPE
        q, k = apply_rotary_pos_emb(q, k, cos, sin)  # rotary encoding
        q = torch.cat((q, query_pass), dim=-1)
        k = torch.cat((k, key_pass), dim=-1)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask[:, None, None, :]  # (B, T) -> (B, 1, 1, T)
            att = att.masked_fill(key_padding_mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)  # softmax

        x = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        x = x.transpose(1, 2).contiguous().view(B, T, -1)  # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)

        x = self.output(x)

        if self.collect_attention_map:
            self.attention_map = att

        return x

class MHA_decoder(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.collect_attention_map = False
        self.attention_map = None
        assert args.encoder_dim % args.num_heads == 0
        self.num_heads = args.num_heads
        self.head_size = args.decoder_dim // args.num_heads

        if args.timeshift:
            self.time_shift = nn.ZeroPad2d((0, 0, 1, 0))

        self.query = nn.Linear(args.decoder_dim, args.decoder_dim)
        self.key = nn.Linear(args.decoder_dim, args.decoder_dim)
        self.value = nn.Linear(args.decoder_dim, args.decoder_dim)

        # self.register_buffer("mask", torch.tril(torch.ones(config.ctx_len, config.ctx_len)))

        self.rotary_ndims = int(self.head_size * 0.5)

        self.rotary_emb = RotaryEmbedding(self.rotary_ndims)

        self.output = nn.Linear(args.decoder_dim, args.decoder_dim)

    def forward(self, x, memory, RoPE, key_padding_mask=None):
        B, T, C = x.size()
        _, L, M = memory.size()

        # print("x size: ", x.size(), 'memory size: ', memory.size())
        # print('B, T, C: ', B, T, C, 'L: ', L)

        # self-attention:
        q = self.query(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)  # (B, T, C) -> (B, nh, T, hs)
        k = self.key(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)    # (B, T, C) -> (B, nh, T, hs)
        v = self.value(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)  # (B, T, C) -> (B, nh, T, hs)

        q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:]
        k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:]

        # cos, sin = self.rotary_emb(q, seq_len=T)
        cos, sin = RoPE
        q, k = apply_rotary_pos_emb(q, k, cos, sin)  # rotary encoding
        q = torch.cat((q, query_pass), dim=-1)
        k = torch.cat((k, key_pass), dim=-1)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask[:, None, None, :]  # (B, T) -> (B, 1, 1, T)
            att = att.masked_fill(key_padding_mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)  # softmax

        x = att @ v
        # print("after attention vals: ", x.shape)  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        x = x.transpose(1, 2).contiguous().view(B, T, -1)  # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)

        # x = self.output(x)

        # print("after linear: ", x.shape)


        # cross attention:
        q = self.query(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)       # (B, T, C) -> (B, nh, T, hs)
        k = self.key(memory).view(B, L, self.num_heads, self.head_size).transpose(1, 2)    # (B, L, M) -> (B, nh, L, hs)
        v = self.value(memory).view(B, L, self.num_heads, self.head_size).transpose(1, 2)  # (B, L, M) -> (B, nh, L, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # cross-attention: (B, nh, T, hs) x (B, nh, hs, L) -> (B, nh, T, L)
        # print("att size: ", att.size())
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask[:, None, None, :]  # (B, T) -> (B, 1, 1, T)
            att = att.masked_fill(key_padding_mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)  # softmax

        x = att @ v  # (B, nh, T, L) x (B, nh, L, hs) -> (B, nh, T, hs)
        # print("x decoder size: ", x.size())
        x = x.transpose(1, 2).contiguous().view(B, T, -1)  # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)
        # print("x decoder size transposed: ", x.size())
        x = self.output(x)

        if self.collect_attention_map:
            self.attention_map = att

        return x

class GeGLU(torch.nn.Module):
    def __init__(self, config, layer_id, time_shift=False):
        super().__init__()
        self.layer_id = layer_id

        if time_shift:
            self.time_shift = nn.ZeroPad2d((0, 0, 1, 0))

        hidden_sz = 3 * config.n_ffn
        self.key = nn.Linear(config.n_embd, hidden_sz)
        self.value = nn.Linear(config.n_embd, hidden_sz)
        self.weight = nn.Linear(hidden_sz, config.n_embd)

    def forward(self, x):
        B, T, C = x.size()
        if hasattr(self, 'time_shift'):
            x = torch.cat([self.time_shift(x)[:, :-1, :C//2], x[:, :, C//2:]], dim=-1)

        k = self.key(x)
        v = self.value(x)
        y = self.weight(F.gelu(k) * v)
        return y
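Note that `MHA_rotary` expects a precomputed `(cos, sin)` pair as its `RoPE` argument (the in-module `self.rotary_emb` call is commented out). A small standalone sketch of producing and applying those tables, with arbitrary shapes:

import torch
from tasks.Modules.mhsa_pro import RotaryEmbedding, apply_rotary_pos_emb  # assumes this commit's layout

rot = RotaryEmbedding(dim=32)       # rotary dims per head (hypothetical value)
q = torch.randn(2, 8, 100, 32)      # (batch, heads, time, rotary_dims)
k = torch.randn(2, 8, 100, 32)

cos, sin = rot(q, seq_len=100)      # cached cos/sin tables, each (100, 32)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
print(q_rot.shape)                  # torch.Size([2, 8, 100, 32]) - shape unchanged, pairs of channels rotated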
tasks/audio.py
CHANGED
@@ -2,11 +2,19 @@ from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
+import numpy as np
 import random
 import os
+import torch
+from torch.utils.data import DataLoader
 
 from .utils.evaluation import AudioEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
+from data import FFTDataset
+from models import DualEncoder
+from train import Trainer
+from data_utils import collate_fn, Container
+import yaml
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -43,20 +51,43 @@ async def evaluate_audio(request: AudioEvaluationRequest):
     # Split dataset
     train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
     test_dataset = train_test["test"]
-
+
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
-
+
     #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
-
+    #--------------------------------------------------------------------------------------------
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    args_path = 'config.yaml'
+    data_args = Container(**yaml.safe_load(open(args_path, 'r'))['Data'])
+    model_args = Container(**yaml.safe_load(open(args_path, 'r'))['CNNEncoder'])
+    model_args_f = Container(**yaml.safe_load(open(args_path, 'r'))['CNNEncoder_f'])
+    conformer_args = Container(**yaml.safe_load(open(args_path, 'r'))['Conformer'])
+
+    test_dataset = FFTDataset(test_dataset)
+    test_dl = DataLoader(test_dataset, batch_size=data_args.batch_size, collate_fn=collate_fn)
+
+    model = DualEncoder(model_args, model_args_f, conformer_args)
+    model = model.to(device)
+    missing, unexpected = model.load_state_dict(torch.load(model_args.checkpoint_path))
+
+    loss_fn = torch.nn.BCEWithLogitsLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
+    trainer = Trainer(model=model, optimizer=optimizer,
+                      criterion=loss_fn, output_dim=model_args.output_dim, scaler=None,
+                      scheduler=None, train_dataloader=None,
+                      val_dataloader=None, device=device,
+                      exp_num='test', log_path=None,
+                      range_update=None,
+                      accumulation_step=1, max_iter=np.inf,
+                      exp_name=f"frugal_cnnencoder_inference")
+    predictions, acc = trainer.predict(test_dl, device=device)
     # Make random predictions (placeholder for actual model inference)
     true_labels = test_dataset["label"]
-
-
+
     #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
     #--------------------------------------------------------------------------------------------
tasks/config.yaml
ADDED
@@ -0,0 +1,66 @@
Data:
  # Basics
  log_dir: '/data/frugal/logs'
  # Data
  dataset: "KeplerDataset"
  data_dir: '/data/lightPred/data'
  model_name: "CNNEncoder"
  batch_size: 16
  num_epochs: 1000
  exp_num: 2
  max_len_spectra: 4096
  max_days_lc: 270
  lc_freq: 0.0208
  create_umap: True

CNNEncoder:
  # Model
  in_channels: 1
  num_layers: 4
  stride: 1
  encoder_dims: [32,64,128,256]
  kernel_size: 3
  dropout_p: 0.3
  output_dim: 2
  beta: 1
  load_checkpoint: True
  checkpoint_num: 1
  activation: "silu"
  sine_w0: 1.0
  avg_output: True
  checkpoint_path: 'logs/frugal_2025-01-10/frugal_cnnencoder_2.pth'

CNNEncoder_f:
  # Model
  in_channels: 1
  num_layers: 4
  stride: 1
  encoder_dims: [32,64,128]
  kernel_size: 3
  dropout_p: 0.3
  output_dim: 2
  beta: 1
  load_checkpoint: True
  checkpoint_num: 1
  activation: "silu"
  sine_w0: 1.0
  avg_output: True


Conformer:
  encoder: ["mhsa_pro", "conv"]
  timeshift: false
  num_layers: 8
  encoder_dim: 128
  num_heads: 8
  kernel_size: 3
  dropout_p: 0.2
  norm: "postnorm"


Optimization:
  # Optimization
  max_lr: 1e-5
  weight_decay: 5e-6
  warmup_pct: 0.3
  steps_per_epoch: 3500
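Note (illustrative, not part of this commit): the Optimization block is defined here but never consumed by the code in this diff. A sketch of wiring it to a one-cycle schedule, assuming `model` and `num_epochs` exist; the float() casts are needed because PyYAML parses scientific notation without a decimal point (such as `1e-5`) as a string:

import yaml
import torch
from tasks.data_utils import Container

opt_args = Container(**yaml.safe_load(open('tasks/config.yaml'))['Optimization'])
optimizer = torch.optim.AdamW(model.parameters(), lr=float(opt_args.max_lr),
                              weight_decay=float(opt_args.weight_decay))
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=float(opt_args.max_lr), pct_start=opt_args.warmup_pct,
    steps_per_epoch=opt_args.steps_per_epoch, epochs=num_epochs)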
tasks/data.py
ADDED
@@ -0,0 +1,43 @@
import torch
from torch.utils.data import IterableDataset
from torch.fft import fft
import torchaudio.transforms as T


class SplitDataset(IterableDataset):
    def __init__(self, dataset, is_train=True, train_ratio=0.8):
        self.dataset = dataset
        self.is_train = is_train
        self.train_ratio = train_ratio

    def __iter__(self):
        # Deterministic interleaved split: within every window of 100 items,
        # the first train_ratio*100 go to train and the rest to validation.
        count = 0
        for item in self.dataset:
            is_train_item = count < int(self.train_ratio * 100)
            if is_train_item == self.is_train:
                yield item
            count = (count + 1) % 100

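Note (illustrative, not part of this commit): both split views iterate the same source, so the source must be re-iterable (each pass opens a fresh iterator); a one-shot generator would be exhausted by whichever view runs first. Here `stream_dataset` stands in for any re-iterable stream:

train_ds = SplitDataset(stream_dataset, is_train=True, train_ratio=0.8)
val_ds = SplitDataset(stream_dataset, is_train=False, train_ratio=0.8)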
class FFTDataset(IterableDataset):
    def __init__(self, original_dataset, orig_sample_rate=12000, target_sample_rate=6000):
        self.dataset = original_dataset
        self.resampler = T.Resample(orig_freq=orig_sample_rate, new_freq=target_sample_rate)

    def __iter__(self):
        for item in self.dataset:
            # Assuming the audio data is in item['audio']['array'];
            # modify this based on the actual data structure.
            audio_data = torch.tensor(item['audio']['array']).float()
            if len(audio_data) == 0:
                continue
            resampled_audio = self.resampler(audio_data)
            # Keep the magnitude spectrum: the training code casts the FFT to
            # float, which is not well-defined for a complex tensor.
            fft_data = fft(resampled_audio).abs()

            # Update the item with FFT data
            item['audio']['fft'] = fft_data
            yield item
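Note (illustrative, not part of this commit): a quick check of what FFTDataset yields. One second of 12 kHz audio is resampled to 6000 samples, and the stored FFT is the same-length magnitude spectrum:

import torch
from tasks.data import FFTDataset

dummy = [{'audio': {'array': torch.randn(12000).tolist()}, 'label': 0}]
item = next(iter(FFTDataset(dummy)))
print(item['audio']['fft'].shape)  # torch.Size([6000])
print(item['audio']['fft'].dtype)  # torch.float32 (magnitude spectrum)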
tasks/data_utils.py
ADDED
@@ -0,0 +1,63 @@
import os
import torch
import torch.distributed as dist
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Extract audio arrays and FFT data from the batch of dictionaries.
    # as_tensor avoids the copy warning when the FFT is already a tensor.
    audio_arrays = [torch.as_tensor(item['audio']['array']) for item in batch]
    fft_arrays = [torch.as_tensor(item['audio']['fft']) for item in batch]
    labels = [torch.as_tensor(item['label']) for item in batch]

    # Pad both sequences to the longest item in the batch
    padded_audio = pad_sequence(audio_arrays, batch_first=True, padding_value=0)
    padded_fft = pad_sequence(fft_arrays, batch_first=True, padding_value=0)

    # Return as dictionary with the same structure
    return {
        'audio': {
            'array': padded_audio,
            'fft': padded_fft
        },
        'label': torch.stack(labels)
    }

class Container(object):
    '''A container class that can be used to store any attributes.'''
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def load_dict(self, d):
        for key, value in d.items():
            if getattr(self, key, None) is None:
                setattr(self, key, value)

    def print_attributes(self):
        for key, value in vars(self).items():
            print(f"{key}: {value}")

    def get_dict(self):
        return self.__dict__

def setup():
    """
    Set up the distributed training environment.
    """
    world_size = int(os.environ["WORLD_SIZE"])
    rank = int(os.environ["SLURM_PROCID"])
    jobid = int(os.environ["SLURM_JOBID"])
    gpus_per_node = torch.cuda.device_count()
    print('jobid ', jobid)
    print('gpus per node ', gpus_per_node)
    print(f"Hello from rank {rank} of {world_size} where there are"
          f" {gpus_per_node} allocated GPUs per node. ", flush=True)

    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    if rank == 0: print(f"Group initialized? {dist.is_initialized()}", flush=True)
    local_rank = rank - gpus_per_node * (rank // gpus_per_node)
    torch.cuda.set_device(local_rank)
    print(f"rank: {rank}, local_rank: {local_rank}")
    return local_rank, world_size, gpus_per_node
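Note (illustrative, not part of this commit): collate_fn right-pads ragged items with zeros before batching. A minimal check:

import torch
from tasks.data_utils import collate_fn

batch = [
    {'audio': {'array': torch.ones(5), 'fft': torch.ones(5)}, 'label': 1},
    {'audio': {'array': torch.ones(3), 'fft': torch.ones(3)}, 'label': 0},
]
out = collate_fn(batch)
print(out['audio']['array'].shape)  # torch.Size([2, 5]); the short item is zero-padded
print(out['label'])                 # tensor([1, 0])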
tasks/models.py
ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn as nn
from .Modules.conformer import ConformerEncoder
from .Modules.mhsa_pro import RotaryEmbedding

class ConvBlock(nn.Module):
    def __init__(self, args, num_layer) -> None:
        super().__init__()
        if args.activation == 'silu':
            self.activation = nn.SiLU()
        else:
            self.activation = nn.ReLU()
        # Once num_layer runs past the encoder_dims list, stay at the last dim.
        in_channels = args.encoder_dims[num_layer-1] if num_layer < len(args.encoder_dims) else args.encoder_dims[-1]
        out_channels = args.encoder_dims[num_layer] if num_layer < len(args.encoder_dims) else args.encoder_dims[-1]
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=args.kernel_size,
                      stride=1, padding='same', bias=False),
            nn.BatchNorm1d(num_features=out_channels),
            self.activation,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)

class CNNEncoder(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        print("Using CNN encoder with activation:", args.activation, ", avg_output:", args.avg_output)
        if args.activation == 'silu':
            self.activation = nn.SiLU()
        else:
            self.activation = nn.ReLU()
        self.embedding = nn.Sequential(nn.Conv1d(in_channels=args.in_channels,
                                                 kernel_size=3, out_channels=args.encoder_dims[0],
                                                 stride=1, padding='same', bias=False),
                                       nn.BatchNorm1d(args.encoder_dims[0]),
                                       self.activation,
                                       )

        self.layers = nn.ModuleList([ConvBlock(args, i+1)
                                     for i in range(args.num_layers)])
        self.pool = nn.MaxPool1d(2)
        self.output_dim = args.encoder_dims[-1]
        self.min_seq_len = 2
        self.avg_output = args.avg_output

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if len(x.shape) == 2:
            x = x.unsqueeze(1)
        if len(x.shape) == 3 and x.shape[-1] == 1:
            x = x.permute(0, 2, 1)
        x = self.embedding(x)
        for m in self.layers:
            x = m(x)
            if x.shape[-1] > self.min_seq_len:
                x = self.pool(x)
        if self.avg_output:
            x = x.mean(dim=-1)
        return x


class MultiEncoder(nn.Module):
    def __init__(self, args, conformer_args):
        super().__init__()
        self.backbone = CNNEncoder(args)
        self.backbone.avg_output = False
        self.head_size = conformer_args.encoder_dim // conformer_args.num_heads
        self.rotary_ndims = int(self.head_size * 0.5)
        self.pe = RotaryEmbedding(self.rotary_ndims)
        self.encoder = ConformerEncoder(conformer_args)
        self.output_dim = conformer_args.encoder_dim
        self.avg_output = args.avg_output

    def forward(self, x):
        # Store backbone output in a separate tensor
        backbone_out = self.backbone(x)

        # Create x_enc from backbone_out
        if len(backbone_out.shape) == 2:
            x_enc = backbone_out.unsqueeze(1).clone()
        else:
            x_enc = backbone_out.permute(0, 2, 1).clone()

        RoPE = self.pe(x_enc, x_enc.shape[1])
        x_enc = self.encoder(x_enc, RoPE)

        if len(x_enc.shape) == 3:
            if self.avg_output:
                # Pool over the sequence dimension (a sum, despite the flag's name).
                x_enc = x_enc.sum(dim=1)
            else:
                x_enc = x_enc.permute(0, 2, 1)

        # Return x_enc and the original backbone output
        return x_enc, backbone_out

class DualEncoder(nn.Module):
    def __init__(self, args_x, args_f, conformer_args) -> None:
        super().__init__()
        self.encoder_x = CNNEncoder(args_x)
        self.encoder_f = MultiEncoder(args_f, conformer_args)
        # The frequency branch ends in the conformer, so its width is the
        # conformer dim rather than the CNN's last encoder dim.
        total_output_dim = args_x.encoder_dims[-1] + self.encoder_f.output_dim
        self.regressor = nn.Sequential(
            nn.Linear(total_output_dim, total_output_dim//2),
            nn.BatchNorm1d(total_output_dim//2),
            nn.SiLU(),
            nn.Linear(total_output_dim//2, 1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x1 = self.encoder_x(x)
        x2, _ = self.encoder_f(x)
        features = torch.cat([x1, x2], dim=-1)
        # Squeeze only the last dim so a batch of size 1 keeps its batch dim.
        return self.regressor(features).squeeze(-1)
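Note (illustrative, not part of this commit): a shape walk-through of CNNEncoder with the hyperparameters from config.yaml. Each of the four conv blocks is followed by MaxPool1d(2), so a 4096-sample input shrinks to length 256 before the final mean over time:

import torch
from tasks.data_utils import Container
from tasks.models import CNNEncoder

args = Container(in_channels=1, num_layers=4, encoder_dims=[32, 64, 128, 256],
                 kernel_size=3, stride=1, dropout_p=0.3, activation='silu',
                 avg_output=True)
enc = CNNEncoder(args)
x = torch.randn(8, 4096)  # (batch, samples)
print(enc(x).shape)       # torch.Size([8, 256])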
tasks/train.py
ADDED
@@ -0,0 +1,293 @@
import torch
import numpy as np
import time
import os
import glob
from collections import OrderedDict
from tqdm import tqdm
import torch.distributed as dist

class Trainer(object):
    """
    A class that encapsulates the training loop for a PyTorch model.
    """
    def __init__(self, model, optimizer, criterion, train_dataloader, device, world_size=1, output_dim=2,
                 scheduler=None, val_dataloader=None, max_iter=np.inf, scaler=None,
                 grad_clip=False, exp_num=None, log_path=None, exp_name=None, plot_every=None,
                 cos_inc=False, range_update=None, accumulation_step=1, wandb_log=False, num_quantiles=1,
                 update_func=lambda x: x):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.scaler = scaler
        self.grad_clip = grad_clip
        self.cos_inc = cos_inc
        self.output_dim = output_dim
        self.scheduler = scheduler
        self.train_dl = train_dataloader
        self.val_dl = val_dataloader
        self.train_sampler = self.get_sampler_from_dataloader(train_dataloader)
        self.val_sampler = self.get_sampler_from_dataloader(val_dataloader)
        self.max_iter = max_iter
        self.device = device
        self.world_size = world_size
        self.exp_num = exp_num
        self.exp_name = exp_name
        self.log_path = log_path
        self.best_state_dict = None
        self.plot_every = plot_every
        self.logger = None
        self.range_update = range_update
        self.accumulation_step = accumulation_step
        self.wandb = wandb_log
        self.num_quantiles = num_quantiles
        self.update_func = update_func

    def get_sampler_from_dataloader(self, dataloader):
        if hasattr(dataloader, 'sampler'):
            if isinstance(dataloader.sampler, torch.utils.data.DistributedSampler):
                return dataloader.sampler
            elif hasattr(dataloader.sampler, 'sampler'):
                return dataloader.sampler.sampler

        if hasattr(dataloader, 'batch_sampler') and hasattr(dataloader.batch_sampler, 'sampler'):
            return dataloader.batch_sampler.sampler

        return None

    def fit(self, num_epochs, device, early_stopping=None, only_p=False, best='loss', conf=False):
        """
        Fits the model for the given number of epochs.
        """
        min_loss = np.inf
        best_acc = 0
        train_loss, val_loss = [], []
        train_acc, val_acc = [], []
        lrs = []
        epochs_without_improvement = 0
        main_process = (dist.is_initialized() and dist.get_rank() == 0) or self.device == 'cpu'

        print(f"Starting training for {num_epochs} epochs")
        print("is main process: ", main_process, flush=True)
        global_time = time.time()
        self.epoch = 0
        for epoch in range(num_epochs):
            self.epoch = epoch
            start_time = time.time()
            t_loss, t_acc = self.train_epoch(device, epoch=epoch)
            t_loss_mean = np.nanmean(t_loss)
            train_loss.extend(t_loss)
            global_train_accuracy, global_train_loss = self.process_loss(t_acc, t_loss_mean)
            if main_process:  # Only perform this on the master GPU
                train_acc.append(global_train_accuracy.mean().item())

            v_loss, v_acc = self.eval_epoch(device, epoch=epoch)
            v_loss_mean = np.nanmean(v_loss)
            val_loss.extend(v_loss)
            global_val_accuracy, global_val_loss = self.process_loss(v_acc, v_loss_mean)
            if main_process:  # Only perform this on the master GPU
                val_acc.append(global_val_accuracy.mean().item())

            current_objective = global_val_loss if best == 'loss' else global_val_accuracy.mean()
            improved = False

            if best == 'loss':
                if current_objective < min_loss:
                    min_loss = current_objective
                    improved = True
            else:
                if current_objective > best_acc:
                    best_acc = current_objective
                    improved = True

            if improved:
                model_name = f'{self.log_path}/{self.exp_num}/{self.exp_name}.pth'
                print(f"saving model at {model_name}...")
                torch.save(self.model.state_dict(), model_name)
                self.best_state_dict = self.model.state_dict()
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            current_lr = self.optimizer.param_groups[0]['lr'] if self.scheduler is None \
                else self.scheduler.get_last_lr()[0]

            lrs.append(current_lr)

            print(f'Epoch {epoch}, lr {current_lr}, Train Loss: {global_train_loss:.6f}, Val Loss: '
                  f'{global_val_loss:.6f}, Train Acc: {global_train_accuracy.round(decimals=4).tolist()}, '
                  f'Val Acc: {global_val_accuracy.round(decimals=4).tolist()}, '
                  f'Time: {time.time() - start_time:.2f}s, Total Time: {(time.time() - global_time)/3600:.2f} hr', flush=True)
            if epoch % 10 == 0:
                os.system('nvidia-smi')

            if epochs_without_improvement == early_stopping:
                print('early stopping!', flush=True)
                break
            if time.time() - global_time > (23.83 * 3600):
                print("time limit reached")
                break

        return {"num_epochs": num_epochs, "train_loss": train_loss,
                "val_loss": val_loss, "train_acc": train_acc, "val_acc": val_acc, "lrs": lrs}

    def process_loss(self, acc, loss_mean):
        if torch.cuda.is_available() and dist.is_initialized():
            global_accuracy = torch.as_tensor(acc).cuda()  # Move accuracy to the GPU
            dist.reduce(global_accuracy, dst=0, op=dist.ReduceOp.SUM)
            global_loss = torch.as_tensor(loss_mean).cuda()  # Move loss to the GPU
            dist.reduce(global_loss, dst=0, op=dist.ReduceOp.SUM)

            # Divide both loss and accuracy by world size
            world_size = dist.get_world_size()
            global_loss /= world_size
            global_accuracy /= world_size
        else:
            global_loss = torch.as_tensor(loss_mean)
            global_accuracy = torch.as_tensor(acc)
        return global_accuracy, global_loss

    def load_best_model(self, to_ddp=True, from_ddp=True):
        data_dir = f'{self.log_path}/exp{self.exp_num}'
        state_dict_files = glob.glob(data_dir + '/*.pth')
        print("loading model from ", state_dict_files[-1])

        state_dict = torch.load(state_dict_files[-1]) if to_ddp else torch.load(state_dict_files[0], map_location=self.device)

        if from_ddp:
            print("loading distributed model")
            # Remove "module." prefixes left by DistributedDataParallel
            new_state_dict = OrderedDict()
            for key, value in state_dict.items():
                while key.startswith('module.'):
                    key = key[7:]
                new_state_dict[key] = value
            state_dict = new_state_dict

        self.model.load_state_dict(state_dict, strict=False)

    def check_gradients(self):
        for name, param in self.model.named_parameters():
            if param.grad is not None:
                grad_norm = param.grad.norm().item()
                if grad_norm > 10:
                    print(f"Large gradient in {name}: {grad_norm}")

    def train_epoch(self, device, epoch):
        """
        Trains the model for one epoch.
        """
        if self.train_sampler is not None:
            try:
                self.train_sampler.set_epoch(epoch)
            except AttributeError:
                pass
        self.model.train()
        train_loss = []
        total = 0
        all_accs = torch.zeros(self.output_dim, device=device)
        pbar = tqdm(self.train_dl)
        for i, batch in enumerate(pbar):
            if self.optimizer is not None:
                self.optimizer.zero_grad()
            loss, acc, y = self.train_batch(batch, i, device)
            train_loss.append(loss.item())
            all_accs = all_accs + acc
            total += len(y)
            pbar.set_description(f"train_acc: {acc}, train_loss: {loss.item()}")
            if i > self.max_iter:
                break
        return train_loss, all_accs / total

    def train_batch(self, batch, batch_idx, device):
        x, fft, y = batch['audio']['array'], batch['audio']['fft'], batch['label']
        x = x.to(device).float()
        fft = fft.to(device).float()
        y = y.to(device).float()
        y_pred = self.model(fft)
        loss = self.criterion(y_pred, y)
        loss.backward()
        self.optimizer.step()
        if self.scheduler is not None:
            self.scheduler.step()
        # get predicted classes
        probs = torch.sigmoid(y_pred)
        cls_pred = (probs > 0.5).float()
        acc = (cls_pred == y).sum()
        return loss, acc, y

    def eval_epoch(self, device, epoch):
        """
        Evaluates the model for one epoch.
        """
        self.model.eval()
        val_loss = []
        total = 0
        all_accs = torch.zeros(self.output_dim, device=device)
        pbar = tqdm(self.val_dl)
        for i, batch in enumerate(pbar):
            loss, acc, y = self.eval_batch(batch, i, device)
            val_loss.append(loss.item())
            all_accs = all_accs + acc
            total += len(y)
            pbar.set_description(f"val_acc: {acc}, val_loss: {loss.item()}")
            if i > self.max_iter:
                break
        return val_loss, all_accs / total

    def eval_batch(self, batch, batch_idx, device):
        x, fft, y = batch['audio']['array'], batch['audio']['fft'], batch['label']
        x = x.to(device).float()
        fft = fft.to(device).float()
        y = y.to(device).float()
        with torch.no_grad():
            y_pred = self.model(fft)
            loss = self.criterion(y_pred, y)
            probs = torch.sigmoid(y_pred)
            cls_pred = (probs > 0.5).float()
            acc = (cls_pred == y).sum()
        return loss, acc, y

    def predict(self, test_dataloader, device):
        """
        Returns the predictions of the model on the given dataset.
        """
        self.model.eval()
        total = 0
        all_accs = 0
        predictions = []
        # Iterate the dataloader that was passed in, not self.val_dl
        # (which is None in the inference-only setup in tasks/audio.py).
        pbar = tqdm(test_dataloader)
        for i, batch in enumerate(pbar):
            x, fft, y = batch['audio']['array'], batch['audio']['fft'], batch['label']
            x = x.to(device).float()
            fft = fft.to(device).float()
            y = y.to(device).float()
            with torch.no_grad():
                y_pred = self.model(fft)
                loss = self.criterion(y_pred, y)
                probs = torch.sigmoid(y_pred)
                cls_pred = (probs > 0.5).float()
                acc = (cls_pred == y).sum()
            predictions.append(cls_pred)
            all_accs += acc
            total += len(y)
        return predictions, all_accs / total
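Note (illustrative, not part of this commit): tasks/audio.py only uses Trainer.predict. For completeness, a sketch of wiring the same class for training, assuming `model`, `device`, and DataLoaders `train_dl`/`val_dl` built with collate_fn already exist, and that the logs/2 checkpoint directory has been created:

trainer = Trainer(model=model,
                  optimizer=torch.optim.Adam(model.parameters(), lr=5e-4),
                  criterion=torch.nn.BCEWithLogitsLoss(),
                  train_dataloader=train_dl, val_dataloader=val_dl,
                  device=device, output_dim=1,
                  exp_num=2, log_path='logs', exp_name='frugal_cnnencoder')
history = trainer.fit(num_epochs=10, device=device, early_stopping=5, best='loss')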