Chenghao-Qiu committed on
Commit 2134860 · verified · 1 Parent(s): 47180e3

Upload folder using huggingface_hub
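For reference, an upload like the one in this commit is typically done with the huggingface_hub client. The sketch below is illustrative only; the local folder path and repo id are placeholders, not values taken from this commit.

from huggingface_hub import HfApi

api = HfApi()
# Push every file in the local training output directory (hypothetical path)
# to a model repo in a single commit, mirroring "Upload folder using huggingface_hub".
api.upload_folder(
    folder_path="./hyena_finetune_output",   # hypothetical local folder
    repo_id="user/hyena-finetuned",          # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)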

Files changed (49)
  1. checkpoint-1000/config.json +37 -0
  2. checkpoint-1000/configuration_hyena.py +88 -0
  3. checkpoint-1000/modeling_hyena.py +574 -0
  4. checkpoint-1000/optimizer.pt +3 -0
  5. checkpoint-1000/pytorch_model.bin +3 -0
  6. checkpoint-1000/rng_state.pth +3 -0
  7. checkpoint-1000/scaler.pt +3 -0
  8. checkpoint-1000/scheduler.pt +3 -0
  9. checkpoint-1000/special_tokens_map.json +51 -0
  10. checkpoint-1000/tokenization_hyena.py +117 -0
  11. checkpoint-1000/tokenizer_config.json +72 -0
  12. checkpoint-1000/trainer_state.json +141 -0
  13. checkpoint-1000/training_args.bin +3 -0
  14. checkpoint-600/config.json +37 -0
  15. checkpoint-600/configuration_hyena.py +88 -0
  16. checkpoint-600/modeling_hyena.py +574 -0
  17. checkpoint-600/optimizer.pt +3 -0
  18. checkpoint-600/pytorch_model.bin +3 -0
  19. checkpoint-600/rng_state.pth +3 -0
  20. checkpoint-600/scaler.pt +3 -0
  21. checkpoint-600/scheduler.pt +3 -0
  22. checkpoint-600/special_tokens_map.json +51 -0
  23. checkpoint-600/tokenization_hyena.py +117 -0
  24. checkpoint-600/tokenizer_config.json +72 -0
  25. checkpoint-600/trainer_state.json +91 -0
  26. checkpoint-600/training_args.bin +3 -0
  27. checkpoint-800/config.json +37 -0
  28. checkpoint-800/configuration_hyena.py +88 -0
  29. checkpoint-800/modeling_hyena.py +574 -0
  30. checkpoint-800/optimizer.pt +3 -0
  31. checkpoint-800/pytorch_model.bin +3 -0
  32. checkpoint-800/rng_state.pth +3 -0
  33. checkpoint-800/scaler.pt +3 -0
  34. checkpoint-800/scheduler.pt +3 -0
  35. checkpoint-800/special_tokens_map.json +51 -0
  36. checkpoint-800/tokenization_hyena.py +117 -0
  37. checkpoint-800/tokenizer_config.json +72 -0
  38. checkpoint-800/trainer_state.json +116 -0
  39. checkpoint-800/training_args.bin +3 -0
  40. config.json +37 -0
  41. configuration_hyena.py +88 -0
  42. modeling_hyena.py +574 -0
  43. optimizer_state_dict.pth +3 -0
  44. pytorch_model.bin +3 -0
  45. special_tokens_map.json +51 -0
  46. tokenization_hyena.py +117 -0
  47. tokenizer_config.json +72 -0
  48. trainer_state.json +156 -0
  49. training_args.bin +3 -0
checkpoint-1000/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "_name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
+ "activation_freq": 10,
+ "architectures": [
+ "HyenaDNAForSequenceClassification"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_hyena.HyenaConfig",
+ "AutoModel": "modeling_hyena.HyenaDNAModel",
+ "AutoModelForCausalLM": "modeling_hyena.HyenaDNAForCausalLM",
+ "AutoModelForSequenceClassification": "modeling_hyena.HyenaDNAForSequenceClassification"
+ },
+ "d_inner": 1024,
+ "d_model": 256,
+ "emb_dim": 5,
+ "embed_dropout": 0.1,
+ "filter_order": 64,
+ "hyena_dropout": 0.0,
+ "hyena_filter_dropout": 0.0,
+ "hyena_order": 2,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "max_seq_len": 32770,
+ "model_type": "hyenadna",
+ "n_layer": 4,
+ "num_inner_mlps": 2,
+ "pad_token_id": 4,
+ "pad_vocab_size_multiple": 8,
+ "problem_type": "single_label_classification",
+ "short_filter_order": 3,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "train_freq": true,
+ "transformers_version": "4.26.1",
+ "use_bias": true,
+ "vocab_size": 12
+ }
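Because config.json routes the custom classes through auto_map, a checkpoint laid out like this is normally loaded with trust_remote_code=True. A minimal sketch (the checkpoint path below is a placeholder, not part of this commit):

from transformers import AutoTokenizer, AutoModelForSequenceClassification

ckpt = "path/to/checkpoint-1000"  # placeholder: local folder or Hub repo id
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, trust_remote_code=True)

inputs = tokenizer("ACGTACGT", return_tensors="pt")
logits = model(**inputs).logits  # shape: (batch, num_labels)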
checkpoint-1000/configuration_hyena.py ADDED
@@ -0,0 +1,88 @@
+ from transformers import PretrainedConfig
+ import json
+
+
+ class HyenaConfig(PretrainedConfig):
+ model_type = "hyenadna"
+ def __init__(
+ self,
+ vocab_size=12,
+ d_model=256,
+ d_inner=None,
+ use_bias=True,
+ train_freq=True,
+ max_seq_len=1024,
+ emb_dim=3,
+ n_layer=12,
+ num_inner_mlps=2,
+ hyena_order=2,
+ short_filter_order=3,
+ filter_order=64,
+ activation_freq=1,
+ embed_dropout=0.1,
+ hyena_dropout=0.0,
+ hyena_filter_dropout=0.0,
+ layer_norm_epsilon=1e-5,
+ initializer_range=0.02,
+ pad_vocab_size_multiple=8,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.d_model = d_model
+ if d_inner is None:
+ self.d_inner = 4 * d_model
+ else:
+ self.d_inner = d_inner
+ self.use_bias = use_bias
+ self.train_freq = train_freq
+ self.max_seq_len = max_seq_len
+ self.emb_dim = emb_dim
+ self.n_layer = n_layer
+ self.hyena_order = hyena_order
+ self.filter_order = filter_order
+ self.short_filter_order = short_filter_order
+ self.activation_freq = activation_freq
+ self.num_inner_mlps = num_inner_mlps
+ self.embed_dropout = embed_dropout
+ self.hyena_dropout = hyena_dropout
+ self.hyena_filter_dropout = hyena_filter_dropout
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.initializer_range = initializer_range
+ self.pad_vocab_size_multiple = pad_vocab_size_multiple
+ super().__init__(**kwargs)
+
+ @classmethod
+ def from_original_config(cls, config_path, **kwargs):
+ with open(config_path, "r") as f:
+ config = json.load(f)
+
+ vocab_size = config["vocab_size"]
+ d_model = config["d_model"]
+ d_inner = config["d_inner"]
+ max_seq_len = config["layer"]["l_max"]
+ emb_dim = config["layer"]["emb_dim"]
+ filter_order = config["layer"]["filter_order"]
+ if "local_order" in config["layer"]:
+ short_filter_order = config["layer"]["local_order"]
+ elif "short_filter_order" in config["layer"]:
+ short_filter_order = config["layer"]["short_filter_order"]
+ else:
+ short_filter_order = 3
+ n_layer = config["n_layer"]
+ activation_freq = config["layer"]["w"]
+ embed_dropout = config["embed_dropout"]
+ pad_vocab_size_multiple = config["pad_vocab_size_multiple"]
+ return cls(vocab_size=vocab_size,
+ d_model=d_model,
+ d_inner=d_inner,
+ max_seq_len=max_seq_len,
+ emb_dim=emb_dim,
+ filter_order=filter_order,
+ short_filter_order=short_filter_order,
+ n_layer=n_layer,
+ activation_freq=activation_freq,
+ embed_dropout=embed_dropout,
+ pad_vocab_size_multiple=pad_vocab_size_multiple,
+ tie_word_embeddings=False,
+ **kwargs
+ )
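As a small illustration of the constructor's d_inner handling read directly from the code above: when d_inner is omitted, it falls back to 4 * d_model.

config = HyenaConfig(d_model=256)                # d_inner not given
assert config.d_inner == 4 * 256                 # defaults to 1024
config = HyenaConfig(d_model=256, d_inner=1024)  # matches the uploaded config.json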
checkpoint-1000/modeling_hyena.py ADDED
@@ -0,0 +1,574 @@
+ # -*- coding: utf-8 -*-
+ """HyenaDNA custom code port to Hugging Face Hub"""
+
+ import math
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from .configuration_hyena import HyenaConfig
+ from transformers import PreTrainedModel
+ from typing import Optional, Tuple, Union
+ from transformers.modeling_outputs import CausalLMOutput, SequenceClassifierOutput, BaseModelOutputWithNoAttention
+
+
+ def fftconv(u, k, D):
+ """
+ We apply a convolution through the fourier domain (from the Convolution Theorem)
+
+ """
+ seqlen = u.shape[-1]
+ fft_size = 2 * seqlen
+
+ k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
+ u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
+
+ if len(u.shape) > 3: k_f = k_f.unsqueeze(1)
+ y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen]
+
+ out = y + u * D.unsqueeze(-1)
+ return out.to(dtype=u.dtype)
+
+
+ @torch.jit.script
+ def mul_sum(q, y):
+ return (q * y).sum(dim=1)
+
+
+ class HyenaSin(nn.Module):
+ """The Sin activation function for the Hyena Filter function."""
+ def __init__(self, config):
+ super().__init__()
+ self.freq = nn.Parameter(config.activation_freq * torch.ones(1, config.filter_order)) if config.train_freq else config.activation_freq * torch.ones(1, config.filter_order)
+
+ def forward(self, x):
+ return torch.sin(self.freq * x)
+
+
+ class HyenaPositionalEmbedding(nn.Module):
+ def __init__(self, config):
+ """Complex exponential positional embeddings for Hyena filters."""
+ super().__init__()
+
+ self.seq_len = config.max_seq_len
+ # The time embedding fed to the filteres is normalized so that t_f = 1
+ t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1
+
+ if config.emb_dim > 1:
+ bands = (config.emb_dim - 1) // 2
+ # To compute the right embeddings we use the "proper" linspace
+ t_rescaled = torch.linspace(0, self.seq_len - 1, self.seq_len)[None, :, None]
+ w = 2 * math.pi * t_rescaled / self.seq_len # 1, L, 1
+
+ f = torch.linspace(1e-4, bands - 1, bands)[None, None]
+
+ z = torch.cat([t, torch.cos(-f * w), torch.sin(-f * w)], dim=-1)
+
+ self.register_buffer("z", z)
+ self.register_buffer("t", t)
+
+ def forward(self, L):
+ return self.z[:, :L], self.t[:, :L]
+
+
+ class HyenaExponentialModulation(nn.Module):
+ """The window function applied to the output of the (MLP) filter function."""
+ def __init__(
+ self,
+ d_model,
+ fast_decay_pct=0.3,
+ slow_decay_pct=1.5,
+ target=1e-2,
+ modulate: bool=True,
+ shift: float = 0.05,
+ **kwargs
+ ):
+ super().__init__()
+ self.modulate = modulate
+ self.shift = shift
+ max_decay = math.log(target) / fast_decay_pct
+ min_decay = math.log(target) / slow_decay_pct
+ deltas = torch.linspace(min_decay, max_decay, d_model)[None, None]
+ self.register_buffer("deltas", deltas)
+
+ def forward(self, t, x):
+ if self.modulate:
+ decay = torch.exp(-t * self.deltas.abs())
+ x = x * (decay + self.shift)
+ return x
+
+
+ class HyenaFilter(nn.Module):
+ def __init__(
+ self,
+ config,
+ **kwargs
+ ):
+ """
+ Implicit long filter with modulation.
+
+ Args:
+ d_model: number of channels in the input
+ emb_dim: dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands
+ order: width of the FFN
+ num_inner_mlps: number of inner linear layers inside filter MLP
+
+ Note:
+ filter_dropout is not implemented
+ """
+ super().__init__()
+
+ self.d_model = config.d_model * (config.hyena_order - 1)
+ self.use_bias = config.use_bias
+ self.bias = nn.Parameter(torch.randn(self.d_model))
+ self.dropout = nn.Dropout(config.hyena_filter_dropout)
+
+ act = HyenaSin(config)
+ self.emb_dim = config.emb_dim
+ assert self.emb_dim % 2 != 0 and self.emb_dim >= 3, "emb_dim must be odd and greater or equal to 3 (time, sine and cosine)"
+ self.seq_len = config.max_seq_len
+
+ self.pos_emb = HyenaPositionalEmbedding(config)
+
+ self.implicit_filter = nn.Sequential(
+ nn.Linear(self.emb_dim, config.filter_order),
+ act,
+ )
+ for i in range(config.num_inner_mlps):
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.filter_order))
+ self.implicit_filter.append(act)
+
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.d_model, bias=False))
+
+ self.modulation = HyenaExponentialModulation(config.d_model)
+
+ self.normalized = False
+
+ def filter(self, L, *args, **kwargs):
+ z, t = self.pos_emb(L)
+ h = self.implicit_filter(z.to(dtype=self.implicit_filter[0].weight.dtype))
+ h = self.modulation(t, h)
+ return h
+
+ def forward(self, x, L, k=None, bias=None, *args, **kwargs):
+ if k is None: k = self.filter(L)
+
+ # Ensure compatibility with filters that return a tuple
+ k = k[0] if type(k) is tuple else k
+
+ y = fftconv(x, k, bias)
+ return y
+
+
+ class HyenaOperator(nn.Module):
+ def __init__(
+ self,
+ config,
+ **filter_args,
+ ):
+ r"""
+ Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf
+
+ Args:
+ d_model (int): Dimension of the input and output embeddings (width of the layer)
+ l_max: (int): Maximum input sequence length. Defaults to None
+ order: (int): Depth of the Hyena recurrence. Defaults to 2
+ dropout: (float): Dropout probability. Defaults to 0.0
+ filter_dropout: (float): Dropout probability for the filter. Defaults to 0.0
+ """
+ super().__init__()
+
+ self.d_model = config.d_model
+ self.l_max = config.max_seq_len
+ self.order = config.hyena_order
+ inner_width = config.d_model * (self.order + 1)
+ self.dropout = nn.Dropout(config.hyena_dropout)
+ self.in_proj = nn.Linear(self.d_model, inner_width)
+ self.out_proj = nn.Linear(self.d_model, self.d_model)
+
+ self.short_filter = nn.Conv1d(
+ inner_width,
+ inner_width,
+ config.short_filter_order,
+ padding=2,
+ groups=inner_width
+ )
+ self.filter_fn = HyenaFilter(config)
+
+ def forward(self, u):
+ l = u.size(-2)
+ l_filter = min(l, self.l_max)
+ u = self.in_proj(u).transpose(1, 2)
+
+ uc = self.short_filter(u)[...,:l_filter]
+ *x, v = uc.split(self.d_model, dim=1)
+
+ k = self.filter_fn.filter(l_filter)[0]
+ k = k.transpose(0, 1).reshape(self.order - 1, self.d_model, l_filter)
+ bias = self.filter_fn.bias.reshape(self.order - 1, self.d_model)
+
+ for o, x_i in enumerate(reversed(x[1:])):
+ v = self.dropout(v * x_i)
+ v = self.filter_fn(v, l_filter, k=k[o], bias=bias[o])
+
+ y = (v * x[0]).transpose(1, 2)
+
+ y = self.out_proj(y)
+ return y
+
+ class HyenaMlp(nn.Module):
+
+ def __init__(self, config):
+ """
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/mlp.py
+ """
+ super().__init__()
+ in_features = config.d_model
+ hidden_features = config.d_inner
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.fc2 = nn.Linear(hidden_features, config.d_model)
+
+ def forward(self, x):
+ y = self.fc1(x)
+ y = F.gelu(y, approximate="tanh")
+ y = self.fc2(y)
+ return y
+
+ class HyenaBlock(nn.Module):
+
+ def __init__(self, config):
+ """
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/block.py
+ For prenorm=True, this Block has a slightly different structure compared to a regular
+ prenorm Transformer block.
+ The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
+ [Ref: https://arxiv.org/abs/2002.04745]
+ Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both
+ the hidden_states (output of the MLP) and the residual.
+ This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
+ The residual needs to be provided (except for the very first block).
+ For prenorm=False, this Block has the same structure as a regular postnorm Transformer
+ block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN.
+ return_residual: whether each of the sub-layers (mixer and mlp) will return the residual.
+ This is for performance reason: for post-norm architecture, returning the input allows us
+ to fuse the backward of nn.Linear with the residual connection.
+ """
+ super().__init__()
+ self.mixer = HyenaOperator(config)
+ self.norm1 = nn.LayerNorm(config.d_model)
+ self.mlp = HyenaMlp(config)
+ self.norm2 = nn.LayerNorm(config.d_model)
+
+ def forward(self, hidden_states):
+ r"""Pass the input through the encoder layer.
+ Args:
+ hidden_states: the sequence to the encoder layer (required).
+ residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
+ before applying the query projection. Useful for e.g., ViT where we only care
+ about the CLS token in the last layer.
+ """
+ residual = hidden_states
+ residual = residual.to(torch.float32)
+ hyena_normed = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
+ hidden_states = self.mixer(hyena_normed)
+ # Tested above here and all is equivalent. That means the mixer is fine!!!
+ residual = hidden_states + residual
+ hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
+ residual = residual.to(torch.float32)
+
+ hidden_states = self.mlp(hidden_states)
+ return hidden_states + residual
+
+
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
+
+
+ class HyenaEmbeddings(nn.Module):
+
+ def __init__(self, config, padding_idx=None):
+ """
+ If max_position_embeddings <= 0, there's no position embeddings
+ If word_embe_proj_dim is not None (e.g., OPT-350m), we embed to that dimension
+ the project up to embed_dim
+ """
+ super().__init__()
+ vocab_size = config.vocab_size
+ if vocab_size % config.pad_vocab_size_multiple != 0:
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
+ self.word_embeddings = nn.Embedding(vocab_size, config.d_model, padding_idx=padding_idx)
+
+ def forward(self, input_ids):
+ """
+ input_ids: (batch, seqlen)
+ """
+ embeddings = self.word_embeddings(input_ids)
+ return embeddings
+
+ class HyenaLMBackbone(nn.Module):
+
+ def __init__(self, config) -> None:
+ super().__init__()
+ # note max_position_embeddings is 0 for Hyena, and therefore isn't used
+ self.embeddings = HyenaEmbeddings(config)
+ self.dropout = nn.Dropout(config.embed_dropout)
+
+ self.layers = nn.ModuleList([HyenaBlock(config) for i in range(config.n_layer)])
+
+ self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+ self.gradient_checkpointing = False
+
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
+ all_hidden_states = []
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+ else:
+ hidden_states = self.embeddings(input_ids)
+ if output_hidden_states:
+ all_hidden_states.append(hidden_states)
+
+ for layer in self.layers:
+ if self.gradient_checkpointing and self.training:
+ hidden_states = self._gradient_checkpointing_func(layer.__call__, hidden_states)
+ else:
+ hidden_states = layer(hidden_states)
+ if output_hidden_states:
+ all_hidden_states.append(hidden_states)
+
+ hidden_states = self.ln_f(hidden_states.to(dtype=self.ln_f.weight.dtype))
+ if output_hidden_states:
+ all_hidden_states.append(hidden_states)
+
+ return hidden_states, all_hidden_states
+
+
+ class HyenaDNAPreTrainedModel(PreTrainedModel):
+ config_class = HyenaConfig
+ base_model_prefix = "hyena"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["HyenaBlock"]
+ _skip_keys_device_placement = "past_key_values"
+ _keys_to_ignore_on_load_missing = [r"freq"] # Shared tensors that safetensors merges
+
+ def _init_weights(self, module, initializer_range=0.02):
+ if isinstance(module, nn.Linear):
+ nn.init.normal_(module.weight, std=initializer_range)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+ elif isinstance(module, nn.Embedding):
+ nn.init.normal_(module.weight, std=initializer_range)
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
+ #
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+ for name, p in self.named_parameters():
+ if name in ["out_proj.weight", "fc2.weight"]:
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
+ # If using GLU activation for now, we scale the std by 2
+ elif name in ["output_linear.0.weight"]:
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
+
+
+ class HyenaDNAModel(HyenaDNAPreTrainedModel):
+ def __init__(self, config, **kwargs) -> None:
+ super().__init__(config, **kwargs)
+
+ self.backbone = HyenaLMBackbone(config)
+ self.config = config
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=None, return_dict=None):
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ hidden_states, all_hidden_states = self.backbone(input_ids, inputs_embeds=inputs_embeds, output_hidden_states=output_hidden_states)
+ if return_dict:
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states if output_hidden_states else None)
+ elif output_hidden_states:
+ return hidden_states, all_hidden_states
+ else:
+ return hidden_states
+
+
+ class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
+
+ def __init__(self, config, **kwargs):
+ super().__init__(config, **kwargs)
+ self.hyena = HyenaDNAModel(config)
+ vocab_size = config.vocab_size
+ if vocab_size % config.pad_vocab_size_multiple != 0:
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
+ self.vocab_size = vocab_size
+ self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.hyena.backbone.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.hyena.backbone.embeddings.word_embeddings = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.hyena = decoder
+
+ def get_decoder(self):
+ return self.hyena
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutput]:
+
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.hyena(
+ input_ids=input_ids,
+ inputs_embeds=inputs_embeds,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = nn.CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ )
+
+
+ class HyenaDNAForSequenceClassification(HyenaDNAPreTrainedModel):
+ def __init__(self, config, **kwargs):
+ super().__init__(config, **kwargs)
+ self.num_labels = kwargs.get("num_labels", config.num_labels)
+ self.hyena = HyenaDNAModel(config)
+ self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.hyena.backbone.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.hyena.backbone.embeddings.word_embeddings = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.hyena(
+ input_ids,
+ inputs_embeds=inputs_embeds,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = nn.MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = nn.CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = nn.BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=pooled_logits,
+ hidden_states=transformer_outputs.hidden_states,
+ )
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ef7e54ba1b263d9091a3b54061ee508313c2e29a1b4fd6c4a456699aab93ff5
+ size 26304517
checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a980d905b63d647cd0ba855443b706a1ccbbe69153ae04b1954150e780ece36d
+ size 16300157
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5773d772a88e414bc446a704f98e2303c1d836815da2d0e7d245a164163118d
+ size 14575
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46b7af93ae41ac3bdc5a501bc85a1111baad8d4df217a9776b7435044a9d320a
+ size 557
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ebe2cff767413b62b8cb3e5d55cf4f3e637c678e963eb9059d55891b8da69e4
+ size 627
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-1000/tokenization_hyena.py ADDED
@@ -0,0 +1,117 @@
+ from transformers import PreTrainedTokenizer, AddedToken
+ from typing import List, Optional, Union, Dict, Sequence, Tuple
+ from pathlib import Path
+ import json
+ import os
+
+
+ class HyenaDNATokenizer(PreTrainedTokenizer):
+ model_input_names = ["input_ids"]
+
+ def __init__(self,
+ model_max_length: int,
+ bos_token="[BOS]",
+ eos_token="[SEP]",
+ sep_token="[SEP]",
+ cls_token="[CLS]",
+ pad_token="[PAD]",
+ mask_token="[MASK]",
+ unk_token="[UNK]",
+ **kwargs):
+ """Character tokenizer for Hugging Face transformers.
+ Args:
+ characters (Sequence[str]): List of desired characters. Any character which
+ is not included in this list will be replaced by a special token called
+ [UNK] with id=6. Following are list of all of the special tokens with
+ their corresponding ids:
+ "[CLS]": 0
+ "[SEP]": 1
+ "[BOS]": 2
+ "[MASK]": 3
+ "[PAD]": 4
+ "[RESERVED]": 5
+ "[UNK]": 6
+ an id (starting at 7) will be assigned to each character.
+ model_max_length (int): Model maximum sequence length.
+ """
+ self.characters = ('A', 'C', 'G', 'T', 'N')
+ self.model_max_length = model_max_length
+
+ self._vocab_str_to_int = {
+ "[CLS]": 0,
+ "[SEP]": 1,
+ "[BOS]": 2,
+ "[MASK]": 3,
+ "[PAD]": 4,
+ "[RESERVED]": 5,
+ "[UNK]": 6,
+ **{ch: i + 7 for i, ch in enumerate(self.characters)},
+ }
+ self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
+ padding_side = kwargs.pop("padding_side", "left")
+
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ unk_token=unk_token,
+ add_prefix_space=add_prefix_space,
+ model_max_length=model_max_length,
+ padding_side=padding_side,
+ **kwargs,
+ )
+
+ @property
+ def vocab_size(self) -> int:
+ return len(self._vocab_str_to_int)
+
+ def _tokenize(self, text: str) -> List[str]:
+ return list(text)
+
+ def _convert_token_to_id(self, token: str) -> int:
+ return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
+
+ def _convert_id_to_token(self, index: int) -> str:
+ return self._vocab_int_to_str[index]
+
+ def convert_tokens_to_string(self, tokens):
+ return "".join(tokens)
+
+ def get_special_tokens_mask(
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None,
+ already_has_special_tokens: bool = False,
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0,
+ token_ids_1=token_ids_1,
+ already_has_special_tokens=True,
+ )
+
+ result = ([0] * len(token_ids_0)) + [1]
+ if token_ids_1 is not None:
+ result += ([0] * len(token_ids_1)) + [1]
+ return result
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ sep = [self.sep_token_id]
+ # cls = [self.cls_token_id]
+ result = token_ids_0 + sep
+ if token_ids_1 is not None:
+ result += token_ids_1 + sep
+ return result
+
+ def get_vocab(self) -> Dict[str, int]:
+ return self._vocab_str_to_int
+
+ # HyenaDNA has a fixed vocabulary with no vocab file
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
+ return ()
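The class above is a plain character tokenizer over A/C/G/T/N with fixed ids; a small usage sketch, with the expected ids read off the _vocab_str_to_int table (a [SEP] id is appended by build_inputs_with_special_tokens):

tok = HyenaDNATokenizer(model_max_length=32770)
enc = tok("ACGTN")
print(enc["input_ids"])  # expected per the vocab table: [7, 8, 9, 10, 11, 1]
print(tok.decode(enc["input_ids"], skip_special_tokens=True))  # "ACGTN"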
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,72 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "6": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_hyena.HyenaDNATokenizer",
+ null
+ ]
+ },
+ "bos_token": "[BOS]",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "model_max_length": 256,
+ "name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
+ "pad_token": "[PAD]",
+ "padding_side": "right",
+ "sep_token": "[SEP]",
+ "special_tokens_map_file": "/home/hlv8980/.cache/huggingface/hub/models--LongSafari--hyenadna-small-32k-seqlen-hf/snapshots/8fe770c78eb13fe33bf81501612faeddf4d6f331/special_tokens_map.json",
+ "tokenizer_class": "HyenaDNATokenizer",
+ "unk_token": "[UNK]"
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,141 @@
+ {
+ "best_metric": 0.39216598868370056,
+ "best_model_checkpoint": "/scratch/hlv8980/Attack_Benchmark/models/hyena/tf4/origin/checkpoint-600",
+ "epoch": 3.3670033670033668,
+ "global_step": 1000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.34,
+ "learning_rate": 2.8760984182776802e-05,
+ "loss": 0.5992,
+ "step": 100
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 2.615114235500879e-05,
+ "loss": 0.4813,
+ "step": 200
+ },
+ {
+ "epoch": 0.67,
+ "eval_accuracy": 0.774,
+ "eval_f1": 0.7713328260834371,
+ "eval_loss": 0.48207539319992065,
+ "eval_matthews_correlation": 0.5579338694412199,
+ "eval_precision": 0.785067107786007,
+ "eval_recall": 0.772997299729973,
+ "eval_runtime": 0.1057,
+ "eval_samples_per_second": 9462.679,
+ "eval_steps_per_second": 151.403,
+ "step": 200
+ },
+ {
+ "epoch": 1.01,
+ "learning_rate": 2.3514938488576452e-05,
+ "loss": 0.4431,
+ "step": 300
+ },
+ {
+ "epoch": 1.35,
+ "learning_rate": 2.087873462214411e-05,
+ "loss": 0.377,
+ "step": 400
+ },
+ {
+ "epoch": 1.35,
+ "eval_accuracy": 0.816,
+ "eval_f1": 0.8159933757615274,
+ "eval_loss": 0.427643358707428,
+ "eval_matthews_correlation": 0.6320128653971173,
+ "eval_precision": 0.815991263965056,
+ "eval_recall": 0.816021602160216,
+ "eval_runtime": 0.1039,
+ "eval_samples_per_second": 9625.637,
+ "eval_steps_per_second": 154.01,
+ "step": 400
+ },
+ {
+ "epoch": 1.68,
+ "learning_rate": 1.82688927943761e-05,
+ "loss": 0.3443,
+ "step": 500
+ },
+ {
+ "epoch": 2.02,
+ "learning_rate": 1.563268892794376e-05,
+ "loss": 0.33,
+ "step": 600
+ },
+ {
+ "epoch": 2.02,
+ "eval_accuracy": 0.824,
+ "eval_f1": 0.8239746523499383,
+ "eval_loss": 0.39216598868370056,
+ "eval_matthews_correlation": 0.6479558982194922,
+ "eval_precision": 0.8239935027265344,
+ "eval_recall": 0.8239623962396239,
+ "eval_runtime": 0.1031,
+ "eval_samples_per_second": 9696.512,
+ "eval_steps_per_second": 155.144,
+ "step": 600
+ },
+ {
+ "epoch": 2.36,
+ "learning_rate": 1.2996485061511423e-05,
+ "loss": 0.227,
+ "step": 700
+ },
+ {
+ "epoch": 2.69,
+ "learning_rate": 1.0360281195079087e-05,
+ "loss": 0.2219,
+ "step": 800
+ },
+ {
+ "epoch": 2.69,
+ "eval_accuracy": 0.838,
+ "eval_f1": 0.8379766686402841,
+ "eval_loss": 0.4026987850666046,
+ "eval_matthews_correlation": 0.6767651028795362,
+ "eval_precision": 0.8385613769517563,
+ "eval_recall": 0.8382038203820381,
+ "eval_runtime": 0.1026,
+ "eval_samples_per_second": 9746.534,
+ "eval_steps_per_second": 155.945,
+ "step": 800
+ },
+ {
+ "epoch": 3.03,
+ "learning_rate": 7.724077328646749e-06,
+ "loss": 0.2121,
+ "step": 900
+ },
+ {
+ "epoch": 3.37,
+ "learning_rate": 5.087873462214412e-06,
+ "loss": 0.1388,
+ "step": 1000
+ },
+ {
+ "epoch": 3.37,
+ "eval_accuracy": 0.857,
+ "eval_f1": 0.8566558306493891,
+ "eval_loss": 0.393052339553833,
+ "eval_matthews_correlation": 0.7159331394438886,
+ "eval_precision": 0.859342750257998,
+ "eval_recall": 0.8565956595659566,
+ "eval_runtime": 0.1036,
+ "eval_samples_per_second": 9656.574,
+ "eval_steps_per_second": 154.505,
+ "step": 1000
+ }
+ ],
+ "max_steps": 1188,
+ "num_train_epochs": 4,
+ "total_flos": 128187317035008.0,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f4e3a85efd6ca1a2228fc0bf6f5ca43150a2981352327acbf61aa5be7e43d49
+ size 3707
checkpoint-600/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "_name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
+ "activation_freq": 10,
+ "architectures": [
+ "HyenaDNAForSequenceClassification"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_hyena.HyenaConfig",
+ "AutoModel": "modeling_hyena.HyenaDNAModel",
+ "AutoModelForCausalLM": "modeling_hyena.HyenaDNAForCausalLM",
+ "AutoModelForSequenceClassification": "modeling_hyena.HyenaDNAForSequenceClassification"
+ },
+ "d_inner": 1024,
+ "d_model": 256,
+ "emb_dim": 5,
+ "embed_dropout": 0.1,
+ "filter_order": 64,
+ "hyena_dropout": 0.0,
+ "hyena_filter_dropout": 0.0,
+ "hyena_order": 2,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "max_seq_len": 32770,
+ "model_type": "hyenadna",
+ "n_layer": 4,
+ "num_inner_mlps": 2,
+ "pad_token_id": 4,
+ "pad_vocab_size_multiple": 8,
+ "problem_type": "single_label_classification",
+ "short_filter_order": 3,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "train_freq": true,
+ "transformers_version": "4.26.1",
+ "use_bias": true,
+ "vocab_size": 12
+ }
checkpoint-600/configuration_hyena.py ADDED
@@ -0,0 +1,88 @@
+ from transformers import PretrainedConfig
+ import json
+
+
+ class HyenaConfig(PretrainedConfig):
+ model_type = "hyenadna"
+ def __init__(
+ self,
+ vocab_size=12,
+ d_model=256,
+ d_inner=None,
+ use_bias=True,
+ train_freq=True,
+ max_seq_len=1024,
+ emb_dim=3,
+ n_layer=12,
+ num_inner_mlps=2,
+ hyena_order=2,
+ short_filter_order=3,
+ filter_order=64,
+ activation_freq=1,
+ embed_dropout=0.1,
+ hyena_dropout=0.0,
+ hyena_filter_dropout=0.0,
+ layer_norm_epsilon=1e-5,
+ initializer_range=0.02,
+ pad_vocab_size_multiple=8,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.d_model = d_model
+ if d_inner is None:
+ self.d_inner = 4 * d_model
+ else:
+ self.d_inner = d_inner
+ self.use_bias = use_bias
+ self.train_freq = train_freq
+ self.max_seq_len = max_seq_len
+ self.emb_dim = emb_dim
+ self.n_layer = n_layer
+ self.hyena_order = hyena_order
+ self.filter_order = filter_order
+ self.short_filter_order = short_filter_order
+ self.activation_freq = activation_freq
+ self.num_inner_mlps = num_inner_mlps
+ self.embed_dropout = embed_dropout
+ self.hyena_dropout = hyena_dropout
+ self.hyena_filter_dropout = hyena_filter_dropout
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.initializer_range = initializer_range
+ self.pad_vocab_size_multiple = pad_vocab_size_multiple
+ super().__init__(**kwargs)
+
+ @classmethod
+ def from_original_config(cls, config_path, **kwargs):
+ with open(config_path, "r") as f:
+ config = json.load(f)
+
+ vocab_size = config["vocab_size"]
+ d_model = config["d_model"]
+ d_inner = config["d_inner"]
+ max_seq_len = config["layer"]["l_max"]
+ emb_dim = config["layer"]["emb_dim"]
+ filter_order = config["layer"]["filter_order"]
+ if "local_order" in config["layer"]:
+ short_filter_order = config["layer"]["local_order"]
+ elif "short_filter_order" in config["layer"]:
+ short_filter_order = config["layer"]["short_filter_order"]
+ else:
+ short_filter_order = 3
+ n_layer = config["n_layer"]
+ activation_freq = config["layer"]["w"]
+ embed_dropout = config["embed_dropout"]
+ pad_vocab_size_multiple = config["pad_vocab_size_multiple"]
+ return cls(vocab_size=vocab_size,
+ d_model=d_model,
+ d_inner=d_inner,
+ max_seq_len=max_seq_len,
+ emb_dim=emb_dim,
+ filter_order=filter_order,
+ short_filter_order=short_filter_order,
+ n_layer=n_layer,
+ activation_freq=activation_freq,
+ embed_dropout=embed_dropout,
+ pad_vocab_size_multiple=pad_vocab_size_multiple,
+ tie_word_embeddings=False,
+ **kwargs
+ )
checkpoint-600/modeling_hyena.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """HyenaDNA custom code port to Hugging Face Hub"""
3
+
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import functional as F
8
+ from .configuration_hyena import HyenaConfig
9
+ from transformers import PreTrainedModel
10
+ from typing import Optional, Tuple, Union
11
+ from transformers.modeling_outputs import CausalLMOutput, SequenceClassifierOutput, BaseModelOutputWithNoAttention
12
+
13
+
14
+ def fftconv(u, k, D):
15
+ """
16
+ We apply a convolution through the fourier domain (from the Convolution Theorem)
17
+
18
+ """
19
+ seqlen = u.shape[-1]
20
+ fft_size = 2 * seqlen
21
+
22
+ k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
23
+ u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
24
+
25
+ if len(u.shape) > 3: k_f = k_f.unsqueeze(1)
26
+ y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen]
27
+
28
+ out = y + u * D.unsqueeze(-1)
29
+ return out.to(dtype=u.dtype)
30
+
31
+
32
+ @torch.jit.script
33
+ def mul_sum(q, y):
34
+ return (q * y).sum(dim=1)
35
+
36
+
37
+ class HyenaSin(nn.Module):
38
+ """The Sin activation function for the Hyena Filter function."""
39
+ def __init__(self, config):
40
+ super().__init__()
41
+ self.freq = nn.Parameter(config.activation_freq * torch.ones(1, config.filter_order)) if config.train_freq else config.activation_freq * torch.ones(1, config.filter_order)
42
+
43
+ def forward(self, x):
44
+ return torch.sin(self.freq * x)
45
+
46
+
47
+ class HyenaPositionalEmbedding(nn.Module):
48
+ def __init__(self, config):
49
+ """Complex exponential positional embeddings for Hyena filters."""
50
+ super().__init__()
51
+
52
+ self.seq_len = config.max_seq_len
53
+ # The time embedding fed to the filteres is normalized so that t_f = 1
54
+ t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1
55
+
56
+ if config.emb_dim > 1:
57
+ bands = (config.emb_dim - 1) // 2
58
+ # To compute the right embeddings we use the "proper" linspace
59
+ t_rescaled = torch.linspace(0, self.seq_len - 1, self.seq_len)[None, :, None]
60
+ w = 2 * math.pi * t_rescaled / self.seq_len # 1, L, 1
61
+
62
+ f = torch.linspace(1e-4, bands - 1, bands)[None, None]
63
+
64
+ z = torch.cat([t, torch.cos(-f * w), torch.sin(-f * w)], dim=-1)
65
+
66
+ self.register_buffer("z", z)
67
+ self.register_buffer("t", t)
68
+
69
+ def forward(self, L):
70
+ return self.z[:, :L], self.t[:, :L]
71
+
72
+
73
+ class HyenaExponentialModulation(nn.Module):
74
+ """The window function applied to the output of the (MLP) filter function."""
75
+ def __init__(
76
+ self,
77
+ d_model,
78
+ fast_decay_pct=0.3,
79
+ slow_decay_pct=1.5,
80
+ target=1e-2,
81
+ modulate: bool=True,
82
+ shift: float = 0.05,
83
+ **kwargs
84
+ ):
85
+ super().__init__()
86
+ self.modulate = modulate
87
+ self.shift = shift
88
+ max_decay = math.log(target) / fast_decay_pct
89
+ min_decay = math.log(target) / slow_decay_pct
90
+ deltas = torch.linspace(min_decay, max_decay, d_model)[None, None]
91
+ self.register_buffer("deltas", deltas)
92
+
93
+ def forward(self, t, x):
94
+ if self.modulate:
95
+ decay = torch.exp(-t * self.deltas.abs())
96
+ x = x * (decay + self.shift)
97
+ return x
98
+
99
+
100
+ class HyenaFilter(nn.Module):
101
+ def __init__(
102
+ self,
103
+ config,
104
+ **kwargs
105
+ ):
106
+ """
107
+ Implicit long filter with modulation.
108
+
109
+ Args:
110
+ d_model: number of channels in the input
111
+ emb_dim: dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands
112
+ order: width of the FFN
113
+ num_inner_mlps: number of inner linear layers inside filter MLP
114
+
115
+ Note:
116
+ filter_dropout is not implemented
117
+ """
118
+ super().__init__()
119
+
120
+ self.d_model = config.d_model * (config.hyena_order - 1)
121
+ self.use_bias = config.use_bias
122
+ self.bias = nn.Parameter(torch.randn(self.d_model))
123
+ self.dropout = nn.Dropout(config.hyena_filter_dropout)
124
+
125
+ act = HyenaSin(config)
126
+ self.emb_dim = config.emb_dim
127
+ assert self.emb_dim % 2 != 0 and self.emb_dim >= 3, "emb_dim must be odd and greater or equal to 3 (time, sine and cosine)"
128
+ self.seq_len = config.max_seq_len
129
+
130
+ self.pos_emb = HyenaPositionalEmbedding(config)
131
+
132
+ self.implicit_filter = nn.Sequential(
133
+ nn.Linear(self.emb_dim, config.filter_order),
134
+ act,
135
+ )
136
+ for i in range(config.num_inner_mlps):
137
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.filter_order))
138
+ self.implicit_filter.append(act)
139
+
140
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.d_model, bias=False))
141
+
142
+ self.modulation = HyenaExponentialModulation(config.d_model)
143
+
144
+ self.normalized = False
145
+
146
+ def filter(self, L, *args, **kwargs):
147
+ z, t = self.pos_emb(L)
148
+ h = self.implicit_filter(z.to(dtype=self.implicit_filter[0].weight.dtype))
149
+ h = self.modulation(t, h)
150
+ return h
151
+
152
+ def forward(self, x, L, k=None, bias=None, *args, **kwargs):
153
+ if k is None: k = self.filter(L)
154
+
155
+ # Ensure compatibility with filters that return a tuple
156
+ k = k[0] if type(k) is tuple else k
157
+
158
+ y = fftconv(x, k, bias)
159
+ return y
160
+
161
+
162
+ class HyenaOperator(nn.Module):
163
+ def __init__(
164
+ self,
165
+ config,
166
+ **filter_args,
167
+ ):
168
+ r"""
169
+ Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf
170
+
171
+ Args:
172
+ d_model (int): Dimension of the input and output embeddings (width of the layer)
173
+ l_max: (int): Maximum input sequence length. Defaults to None
174
+ order: (int): Depth of the Hyena recurrence. Defaults to 2
175
+ dropout: (float): Dropout probability. Defaults to 0.0
176
+ filter_dropout: (float): Dropout probability for the filter. Defaults to 0.0
177
+ """
178
+ super().__init__()
179
+
180
+ self.d_model = config.d_model
181
+ self.l_max = config.max_seq_len
182
+ self.order = config.hyena_order
183
+ inner_width = config.d_model * (self.order + 1)
184
+ self.dropout = nn.Dropout(config.hyena_dropout)
185
+ self.in_proj = nn.Linear(self.d_model, inner_width)
186
+ self.out_proj = nn.Linear(self.d_model, self.d_model)
187
+
188
+ self.short_filter = nn.Conv1d(
189
+ inner_width,
190
+ inner_width,
191
+ config.short_filter_order,
192
+ padding=2,
193
+ groups=inner_width
194
+ )
195
+ self.filter_fn = HyenaFilter(config)
196
+
197
+ def forward(self, u):
198
+ l = u.size(-2)
199
+ l_filter = min(l, self.l_max)
200
+ u = self.in_proj(u).transpose(1, 2)
201
+
202
+ uc = self.short_filter(u)[...,:l_filter]
203
+ *x, v = uc.split(self.d_model, dim=1)
204
+
205
+ k = self.filter_fn.filter(l_filter)[0]
206
+ k = k.transpose(0, 1).reshape(self.order - 1, self.d_model, l_filter)
207
+ bias = self.filter_fn.bias.reshape(self.order - 1, self.d_model)
208
+
209
+ for o, x_i in enumerate(reversed(x[1:])):
210
+ v = self.dropout(v * x_i)
211
+ v = self.filter_fn(v, l_filter, k=k[o], bias=bias[o])
212
+
213
+ y = (v * x[0]).transpose(1, 2)
214
+
215
+ y = self.out_proj(y)
216
+ return y
217
+
218
+ class HyenaMlp(nn.Module):
219
+
220
+ def __init__(self, config):
221
+ """
222
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/mlp.py
223
+ """
224
+ super().__init__()
225
+ in_features = config.d_model
226
+ hidden_features = config.d_inner
227
+ self.fc1 = nn.Linear(in_features, hidden_features)
228
+ self.fc2 = nn.Linear(hidden_features, config.d_model)
229
+
230
+ def forward(self, x):
231
+ y = self.fc1(x)
232
+ y = F.gelu(y, approximate="tanh")
233
+ y = self.fc2(y)
234
+ return y
235
+
236
+ class HyenaBlock(nn.Module):
237
+
238
+ def __init__(self, config):
239
+ """
240
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/block.py
241
+ For prenorm=True, this Block has a slightly different structure compared to a regular
242
+ prenorm Transformer block.
243
+ The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
244
+ [Ref: https://arxiv.org/abs/2002.04745]
245
+ Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both
246
+ the hidden_states (output of the MLP) and the residual.
247
+ This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
248
+ The residual needs to be provided (except for the very first block).
249
+ For prenorm=False, this Block has the same structure as a regular postnorm Transformer
250
+ block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN.
251
+ return_residual: whether each of the sub-layers (mixer and mlp) will return the residual.
252
+ This is for performance reason: for post-norm architecture, returning the input allows us
253
+ to fuse the backward of nn.Linear with the residual connection.
254
+ """
255
+ super().__init__()
256
+ self.mixer = HyenaOperator(config)
257
+ self.norm1 = nn.LayerNorm(config.d_model)
258
+ self.mlp = HyenaMlp(config)
259
+ self.norm2 = nn.LayerNorm(config.d_model)
260
+
261
+ def forward(self, hidden_states):
262
+ r"""Pass the input through the encoder layer.
263
+ Args:
264
+ hidden_states: the sequence to the encoder layer (required).
265
+ residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
266
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
267
+ before applying the query projection. Useful for e.g., ViT where we only care
268
+ about the CLS token in the last layer.
269
+ """
270
+ residual = hidden_states
271
+ residual = residual.to(torch.float32)
272
+ hyena_normed = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
273
+ hidden_states = self.mixer(hyena_normed)
274
+ # Residual connection around the Hyena mixer output
275
+ residual = hidden_states + residual
276
+ hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
277
+ residual = residual.to(torch.float32)
278
+
279
+ hidden_states = self.mlp(hidden_states)
280
+ return hidden_states + residual
281
+
282
+
283
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
284
+
285
+
286
+ class HyenaEmbeddings(nn.Module):
287
+
288
+ def __init__(self, config, padding_idx=None):
289
+ """
290
+ If max_position_embeddings <= 0, there are no position embeddings
291
+ If word_embed_proj_dim is not None (e.g., OPT-350m), we embed to that dimension
292
+ then project up to embed_dim
293
+ """
294
+ super().__init__()
295
+ vocab_size = config.vocab_size
296
+ if vocab_size % config.pad_vocab_size_multiple != 0:
297
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
298
+ self.word_embeddings = nn.Embedding(vocab_size, config.d_model, padding_idx=padding_idx)
299
+
300
+ def forward(self, input_ids):
301
+ """
302
+ input_ids: (batch, seqlen)
303
+ """
304
+ embeddings = self.word_embeddings(input_ids)
305
+ return embeddings
306
+
307
+ class HyenaLMBackbone(nn.Module):
308
+
309
+ def __init__(self, config) -> None:
310
+ super().__init__()
311
+ # note max_position_embeddings is 0 for Hyena, and therefore isn't used
312
+ self.embeddings = HyenaEmbeddings(config)
313
+ self.dropout = nn.Dropout(config.embed_dropout)
314
+
315
+ self.layers = nn.ModuleList([HyenaBlock(config) for i in range(config.n_layer)])
316
+
317
+ self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
318
+ self.gradient_checkpointing = False
319
+
320
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
321
+ all_hidden_states = []
322
+ if inputs_embeds is not None:
323
+ hidden_states = inputs_embeds
324
+ else:
325
+ hidden_states = self.embeddings(input_ids)
326
+ if output_hidden_states:
327
+ all_hidden_states.append(hidden_states)
328
+
329
+ for layer in self.layers:
330
+ if self.gradient_checkpointing and self.training:
331
+ hidden_states = self._gradient_checkpointing_func(layer.__call__, hidden_states)
332
+ else:
333
+ hidden_states = layer(hidden_states)
334
+ if output_hidden_states:
335
+ all_hidden_states.append(hidden_states)
336
+
337
+ hidden_states = self.ln_f(hidden_states.to(dtype=self.ln_f.weight.dtype))
338
+ if output_hidden_states:
339
+ all_hidden_states.append(hidden_states)
340
+
341
+ return hidden_states, all_hidden_states
342
+
343
+
344
+ class HyenaDNAPreTrainedModel(PreTrainedModel):
345
+ config_class = HyenaConfig
346
+ base_model_prefix = "hyena"
347
+ supports_gradient_checkpointing = True
348
+ _no_split_modules = ["HyenaBlock"]
349
+ _skip_keys_device_placement = "past_key_values"
350
+ _keys_to_ignore_on_load_missing = [r"freq"] # Shared tensors that safetensors merges
351
+
352
+ def _init_weights(self, module, initializer_range=0.02):
353
+ if isinstance(module, nn.Linear):
354
+ nn.init.normal_(module.weight, std=initializer_range)
355
+ if module.bias is not None:
356
+ nn.init.zeros_(module.bias)
357
+ elif isinstance(module, nn.Embedding):
358
+ nn.init.normal_(module.weight, std=initializer_range)
359
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
360
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
361
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
362
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
363
+ #
364
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
365
+ for name, p in self.named_parameters():
366
+ if name in ["out_proj.weight", "fc2.weight"]:
367
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
368
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
369
+ # If using GLU activation for now, we scale the std by 2
370
+ elif name in ["output_linear.0.weight"]:
371
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
372
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
373
+
374
+
375
+ class HyenaDNAModel(HyenaDNAPreTrainedModel):
376
+ def __init__(self, config, **kwargs) -> None:
377
+ super().__init__(config, **kwargs)
378
+
379
+ self.backbone = HyenaLMBackbone(config)
380
+ self.config = config
381
+
382
+ # Initialize weights and apply final processing
383
+ self.post_init()
384
+
385
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=None, return_dict=None):
386
+ output_hidden_states = (
387
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
388
+ )
389
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
390
+
391
+ hidden_states, all_hidden_states = self.backbone(input_ids, inputs_embeds=inputs_embeds, output_hidden_states=output_hidden_states)
392
+ if return_dict:
393
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states,
394
+ hidden_states=all_hidden_states if output_hidden_states else None)
395
+ elif output_hidden_states:
396
+ return hidden_states, all_hidden_states
397
+ else:
398
+ return hidden_states
399
+
400
+
401
+ class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
402
+
403
+ def __init__(self, config, **kwargs):
404
+ super().__init__(config, **kwargs)
405
+ self.hyena = HyenaDNAModel(config)
406
+ vocab_size = config.vocab_size
407
+ if vocab_size % config.pad_vocab_size_multiple != 0:
408
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
409
+ self.vocab_size = vocab_size
410
+ self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)
411
+
412
+ # Initialize weights and apply final processing
413
+ self.post_init()
414
+
415
+ def get_input_embeddings(self):
416
+ return self.hyena.backbone.embeddings.word_embeddings
417
+
418
+ def set_input_embeddings(self, value):
419
+ self.hyena.backbone.embeddings.word_embeddings = value
420
+
421
+ def get_output_embeddings(self):
422
+ return self.lm_head
423
+
424
+ def set_output_embeddings(self, new_embeddings):
425
+ self.lm_head = new_embeddings
426
+
427
+ def set_decoder(self, decoder):
428
+ self.hyena = decoder
429
+
430
+ def get_decoder(self):
431
+ return self.hyena
432
+
433
+ def forward(
434
+ self,
435
+ input_ids: torch.LongTensor = None,
436
+ inputs_embeds: Optional[torch.FloatTensor] = None,
437
+ labels: Optional[torch.LongTensor] = None,
438
+ output_hidden_states: Optional[bool] = None,
439
+ return_dict: Optional[bool] = None,
440
+ ) -> Union[Tuple, CausalLMOutput]:
441
+
442
+ output_hidden_states = (
443
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
444
+ )
445
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
446
+
447
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
448
+ outputs = self.hyena(
449
+ input_ids=input_ids,
450
+ inputs_embeds=inputs_embeds,
451
+ output_hidden_states=output_hidden_states,
452
+ return_dict=return_dict,
453
+ )
454
+
455
+ hidden_states = outputs[0]
456
+ logits = self.lm_head(hidden_states)
457
+ logits = logits.float()
458
+
459
+ loss = None
460
+ if labels is not None:
461
+ # Shift so that tokens < n predict n
462
+ shift_logits = logits[..., :-1, :].contiguous()
463
+ shift_labels = labels[..., 1:].contiguous()
464
+ # Flatten the tokens
465
+ loss_fct = nn.CrossEntropyLoss()
466
+ shift_logits = shift_logits.view(-1, self.vocab_size)
467
+ shift_labels = shift_labels.view(-1)
468
+ # Enable model parallelism
469
+ shift_labels = shift_labels.to(shift_logits.device)
470
+ loss = loss_fct(shift_logits, shift_labels)
471
+
472
+ if not return_dict:
473
+ output = (logits,) + outputs[1:]
474
+ return (loss,) + output if loss is not None else output
475
+
476
+ return CausalLMOutput(
477
+ loss=loss,
478
+ logits=logits,
479
+ hidden_states=outputs.hidden_states,
480
+ )
481
+
482
+
483
+ class HyenaDNAForSequenceClassification(HyenaDNAPreTrainedModel):
484
+ def __init__(self, config, **kwargs):
485
+ super().__init__(config, **kwargs)
486
+ self.num_labels = kwargs.get("num_labels", config.num_labels)
487
+ self.hyena = HyenaDNAModel(config)
488
+ self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
489
+
490
+ # Initialize weights and apply final processing
491
+ self.post_init()
492
+
493
+ def get_input_embeddings(self):
494
+ return self.hyena.backbone.embeddings.word_embeddings
495
+
496
+ def set_input_embeddings(self, value):
497
+ self.hyena.backbone.embeddings.word_embeddings = value
498
+
499
+ def forward(
500
+ self,
501
+ input_ids: torch.LongTensor = None,
502
+ inputs_embeds: Optional[torch.FloatTensor] = None,
503
+ labels: Optional[torch.LongTensor] = None,
504
+ output_hidden_states: Optional[bool] = None,
505
+ return_dict: Optional[bool] = None,
506
+ ) -> Union[Tuple, SequenceClassifierOutput]:
507
+ r"""
508
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
509
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
510
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
511
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
512
+ """
513
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
514
+
515
+ transformer_outputs = self.hyena(
516
+ input_ids,
517
+ inputs_embeds=inputs_embeds,
518
+ output_hidden_states=output_hidden_states,
519
+ return_dict=return_dict,
520
+ )
521
+ hidden_states = transformer_outputs[0]
522
+ logits = self.score(hidden_states)
523
+
524
+ if input_ids is not None:
525
+ batch_size = input_ids.shape[0]
526
+ else:
527
+ batch_size = inputs_embeds.shape[0]
528
+
529
+ if self.config.pad_token_id is None and batch_size != 1:
530
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
531
+ if self.config.pad_token_id is None:
532
+ sequence_lengths = -1
533
+ else:
534
+ if input_ids is not None:
535
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
536
+ logits.device
537
+ )
538
+ else:
539
+ sequence_lengths = -1
540
+
541
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
542
+
543
+ loss = None
544
+ if labels is not None:
545
+ labels = labels.to(logits.device)
546
+ if self.config.problem_type is None:
547
+ if self.num_labels == 1:
548
+ self.config.problem_type = "regression"
549
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
550
+ self.config.problem_type = "single_label_classification"
551
+ else:
552
+ self.config.problem_type = "multi_label_classification"
553
+
554
+ if self.config.problem_type == "regression":
555
+ loss_fct = nn.MSELoss()
556
+ if self.num_labels == 1:
557
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
558
+ else:
559
+ loss = loss_fct(pooled_logits, labels)
560
+ elif self.config.problem_type == "single_label_classification":
561
+ loss_fct = nn.CrossEntropyLoss()
562
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
563
+ elif self.config.problem_type == "multi_label_classification":
564
+ loss_fct = nn.BCEWithLogitsLoss()
565
+ loss = loss_fct(pooled_logits, labels)
566
+ if not return_dict:
567
+ output = (pooled_logits,) + transformer_outputs[1:]
568
+ return ((loss,) + output) if loss is not None else output
569
+
570
+ return SequenceClassifierOutput(
571
+ loss=loss,
572
+ logits=pooled_logits,
573
+ hidden_states=transformer_outputs.hidden_states,
574
+ )
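
Note: the classification head above pools the logit at the last non-padding token (GPT-2 style), so a fine-tuned checkpoint from this repo can be used directly through the Auto classes. A minimal usage sketch, assuming a local copy of one of the checkpoint folders; trust_remote_code=True is required because the model classes live in modeling_hyena.py rather than in transformers:

# Minimal sketch: run the sequence classifier shipped with this checkpoint.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

ckpt = "./checkpoint-600"  # local path to one of the checkpoint folders above
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, trust_remote_code=True)
model.eval()

ids = tokenizer("ACGTACGTACGT", return_tensors="pt")["input_ids"]
with torch.no_grad():
    logits = model(input_ids=ids).logits  # shape (1, num_labels)
print(logits.argmax(-1).item())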
checkpoint-600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275d48d3b7464b79d184fe1244342cca6f3d8e813f5d8f03461334b719d4c105
3
+ size 26304517
checkpoint-600/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dbaff97807d7961eb4208709c13b67c9b7fb92e5f7418a96202fef0ae7e5dd5
3
+ size 16300157
checkpoint-600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde37b1d59ec8275aa116b67c6503ca997bd9a90886b106af365cbb8dfef4db9
3
+ size 14575
checkpoint-600/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d21d167def712bc3f600520b89e654492b844aa18ab3c37e7c0d3d698a2a65b5
3
+ size 557
checkpoint-600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af037f192434655ed417b8694a7888adca54958a4cd3e44898830064a0a7c1ee
3
+ size 627
checkpoint-600/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-600/tokenization_hyena.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer, AddedToken
2
+ from typing import List, Optional, Union, Dict, Sequence, Tuple
3
+ from pathlib import Path
4
+ import json
5
+ import os
6
+
7
+
8
+ class HyenaDNATokenizer(PreTrainedTokenizer):
9
+ model_input_names = ["input_ids"]
10
+
11
+ def __init__(self,
12
+ model_max_length: int,
13
+ bos_token="[BOS]",
14
+ eos_token="[SEP]",
15
+ sep_token="[SEP]",
16
+ cls_token="[CLS]",
17
+ pad_token="[PAD]",
18
+ mask_token="[MASK]",
19
+ unk_token="[UNK]",
20
+ **kwargs):
21
+ """Character tokenizer for Hugging Face transformers.
22
+ Args:
23
+ characters (Sequence[str]): List of desired characters. Any character which
24
+ is not included in this list will be replaced by a special token called
25
+ [UNK] with id=6. The following is a list of all of the special tokens with
26
+ their corresponding ids:
27
+ "[CLS]": 0
28
+ "[SEP]": 1
29
+ "[BOS]": 2
30
+ "[MASK]": 3
31
+ "[PAD]": 4
32
+ "[RESERVED]": 5
33
+ "[UNK]": 6
34
+ an id (starting at 7) will be assigned to each character.
35
+ model_max_length (int): Model maximum sequence length.
36
+ """
37
+ self.characters = ('A', 'C', 'G', 'T', 'N')
38
+ self.model_max_length = model_max_length
39
+
40
+ self._vocab_str_to_int = {
41
+ "[CLS]": 0,
42
+ "[SEP]": 1,
43
+ "[BOS]": 2,
44
+ "[MASK]": 3,
45
+ "[PAD]": 4,
46
+ "[RESERVED]": 5,
47
+ "[UNK]": 6,
48
+ **{ch: i + 7 for i, ch in enumerate(self.characters)},
49
+ }
50
+ self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
51
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
52
+ padding_side = kwargs.pop("padding_side", "left")
53
+
54
+ super().__init__(
55
+ bos_token=bos_token,
56
+ eos_token=eos_token,
57
+ sep_token=sep_token,
58
+ cls_token=cls_token,
59
+ pad_token=pad_token,
60
+ mask_token=mask_token,
61
+ unk_token=unk_token,
62
+ add_prefix_space=add_prefix_space,
63
+ model_max_length=model_max_length,
64
+ padding_side=padding_side,
65
+ **kwargs,
66
+ )
67
+
68
+ @property
69
+ def vocab_size(self) -> int:
70
+ return len(self._vocab_str_to_int)
71
+
72
+ def _tokenize(self, text: str) -> List[str]:
73
+ return list(text)
74
+
75
+ def _convert_token_to_id(self, token: str) -> int:
76
+ return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
77
+
78
+ def _convert_id_to_token(self, index: int) -> str:
79
+ return self._vocab_int_to_str[index]
80
+
81
+ def convert_tokens_to_string(self, tokens):
82
+ return "".join(tokens)
83
+
84
+ def get_special_tokens_mask(
85
+ self,
86
+ token_ids_0: List[int],
87
+ token_ids_1: Optional[List[int]] = None,
88
+ already_has_special_tokens: bool = False,
89
+ ) -> List[int]:
90
+ if already_has_special_tokens:
91
+ return super().get_special_tokens_mask(
92
+ token_ids_0=token_ids_0,
93
+ token_ids_1=token_ids_1,
94
+ already_has_special_tokens=True,
95
+ )
96
+
97
+ result = ([0] * len(token_ids_0)) + [1]
98
+ if token_ids_1 is not None:
99
+ result += ([0] * len(token_ids_1)) + [1]
100
+ return result
101
+
102
+ def build_inputs_with_special_tokens(
103
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
104
+ ) -> List[int]:
105
+ sep = [self.sep_token_id]
106
+ # cls = [self.cls_token_id]
107
+ result = token_ids_0 + sep
108
+ if token_ids_1 is not None:
109
+ result += token_ids_1 + sep
110
+ return result
111
+
112
+ def get_vocab(self) -> Dict[str, int]:
113
+ return self._vocab_str_to_int
114
+
115
+ # HyenaDNA has a fixed vocabulary with no vocab file
116
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
117
+ return ()
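
tokenization_hyena.py is a plain character-level tokenizer: A/C/G/T/N map to fixed ids starting at 7, and build_inputs_with_special_tokens appends a single [SEP]. A small sketch of the expected mapping, assuming the file above is importable as a module (the model_max_length value is just an example):

# Sketch: character-level encoding produced by the tokenizer above.
from tokenization_hyena import HyenaDNATokenizer

tok = HyenaDNATokenizer(model_max_length=32770)
ids = tok("ACGTN")["input_ids"]
# 'A'..'N' map to 7..11 and a trailing [SEP] (id 1) is appended:
assert ids == [7, 8, 9, 10, 11, 1]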
checkpoint-600/tokenizer_config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "[CLS]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "[SEP]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "[BOS]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "[MASK]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "6": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ }
52
+ },
53
+ "auto_map": {
54
+ "AutoTokenizer": [
55
+ "tokenization_hyena.HyenaDNATokenizer",
56
+ null
57
+ ]
58
+ },
59
+ "bos_token": "[BOS]",
60
+ "clean_up_tokenization_spaces": true,
61
+ "cls_token": "[CLS]",
62
+ "eos_token": "[SEP]",
63
+ "mask_token": "[MASK]",
64
+ "model_max_length": 256,
65
+ "name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
66
+ "pad_token": "[PAD]",
67
+ "padding_side": "right",
68
+ "sep_token": "[SEP]",
69
+ "special_tokens_map_file": "/home/hlv8980/.cache/huggingface/hub/models--LongSafari--hyenadna-small-32k-seqlen-hf/snapshots/8fe770c78eb13fe33bf81501612faeddf4d6f331/special_tokens_map.json",
70
+ "tokenizer_class": "HyenaDNATokenizer",
71
+ "unk_token": "[UNK]"
72
+ }
checkpoint-600/trainer_state.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.39216598868370056,
3
+ "best_model_checkpoint": "/scratch/hlv8980/Attack_Benchmark/models/hyena/tf4/origin/checkpoint-600",
4
+ "epoch": 2.0202020202020203,
5
+ "global_step": 600,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.34,
12
+ "learning_rate": 2.8760984182776802e-05,
13
+ "loss": 0.5992,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.67,
18
+ "learning_rate": 2.615114235500879e-05,
19
+ "loss": 0.4813,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.67,
24
+ "eval_accuracy": 0.774,
25
+ "eval_f1": 0.7713328260834371,
26
+ "eval_loss": 0.48207539319992065,
27
+ "eval_matthews_correlation": 0.5579338694412199,
28
+ "eval_precision": 0.785067107786007,
29
+ "eval_recall": 0.772997299729973,
30
+ "eval_runtime": 0.1057,
31
+ "eval_samples_per_second": 9462.679,
32
+ "eval_steps_per_second": 151.403,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 1.01,
37
+ "learning_rate": 2.3514938488576452e-05,
38
+ "loss": 0.4431,
39
+ "step": 300
40
+ },
41
+ {
42
+ "epoch": 1.35,
43
+ "learning_rate": 2.087873462214411e-05,
44
+ "loss": 0.377,
45
+ "step": 400
46
+ },
47
+ {
48
+ "epoch": 1.35,
49
+ "eval_accuracy": 0.816,
50
+ "eval_f1": 0.8159933757615274,
51
+ "eval_loss": 0.427643358707428,
52
+ "eval_matthews_correlation": 0.6320128653971173,
53
+ "eval_precision": 0.815991263965056,
54
+ "eval_recall": 0.816021602160216,
55
+ "eval_runtime": 0.1039,
56
+ "eval_samples_per_second": 9625.637,
57
+ "eval_steps_per_second": 154.01,
58
+ "step": 400
59
+ },
60
+ {
61
+ "epoch": 1.68,
62
+ "learning_rate": 1.82688927943761e-05,
63
+ "loss": 0.3443,
64
+ "step": 500
65
+ },
66
+ {
67
+ "epoch": 2.02,
68
+ "learning_rate": 1.563268892794376e-05,
69
+ "loss": 0.33,
70
+ "step": 600
71
+ },
72
+ {
73
+ "epoch": 2.02,
74
+ "eval_accuracy": 0.824,
75
+ "eval_f1": 0.8239746523499383,
76
+ "eval_loss": 0.39216598868370056,
77
+ "eval_matthews_correlation": 0.6479558982194922,
78
+ "eval_precision": 0.8239935027265344,
79
+ "eval_recall": 0.8239623962396239,
80
+ "eval_runtime": 0.1031,
81
+ "eval_samples_per_second": 9696.512,
82
+ "eval_steps_per_second": 155.144,
83
+ "step": 600
84
+ }
85
+ ],
86
+ "max_steps": 1188,
87
+ "num_train_epochs": 4,
88
+ "total_flos": 76909184335872.0,
89
+ "trial_name": null,
90
+ "trial_params": null
91
+ }
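
trainer_state.json keeps the full evaluation history; best_model_checkpoint points at checkpoint-600 because its eval_loss (0.392) is the lowest value logged so far. A short sketch of recovering that entry from the log, assuming the file path used in this repo:

# Sketch: find the best eval entry recorded in a Trainer state file.
import json

with open("checkpoint-600/trainer_state.json") as f:
    state = json.load(f)

evals = [e for e in state["log_history"] if "eval_loss" in e]
best = min(evals, key=lambda e: e["eval_loss"])
print(best["step"], best["eval_loss"], best["eval_accuracy"])  # 600 0.3921... 0.824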
checkpoint-600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4e3a85efd6ca1a2228fc0bf6f5ca43150a2981352327acbf61aa5be7e43d49
3
+ size 3707
checkpoint-800/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
3
+ "activation_freq": 10,
4
+ "architectures": [
5
+ "HyenaDNAForSequenceClassification"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_hyena.HyenaConfig",
9
+ "AutoModel": "modeling_hyena.HyenaDNAModel",
10
+ "AutoModelForCausalLM": "modeling_hyena.HyenaDNAForCausalLM",
11
+ "AutoModelForSequenceClassification": "modeling_hyena.HyenaDNAForSequenceClassification"
12
+ },
13
+ "d_inner": 1024,
14
+ "d_model": 256,
15
+ "emb_dim": 5,
16
+ "embed_dropout": 0.1,
17
+ "filter_order": 64,
18
+ "hyena_dropout": 0.0,
19
+ "hyena_filter_dropout": 0.0,
20
+ "hyena_order": 2,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_epsilon": 1e-05,
23
+ "max_seq_len": 32770,
24
+ "model_type": "hyenadna",
25
+ "n_layer": 4,
26
+ "num_inner_mlps": 2,
27
+ "pad_token_id": 4,
28
+ "pad_vocab_size_multiple": 8,
29
+ "problem_type": "single_label_classification",
30
+ "short_filter_order": 3,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "float32",
33
+ "train_freq": true,
34
+ "transformers_version": "4.26.1",
35
+ "use_bias": true,
36
+ "vocab_size": 12
37
+ }
checkpoint-800/configuration_hyena.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ import json
3
+
4
+
5
+ class HyenaConfig(PretrainedConfig):
6
+ model_type = "hyenadna"
7
+ def __init__(
8
+ self,
9
+ vocab_size=12,
10
+ d_model=256,
11
+ d_inner=None,
12
+ use_bias=True,
13
+ train_freq=True,
14
+ max_seq_len=1024,
15
+ emb_dim=3,
16
+ n_layer=12,
17
+ num_inner_mlps=2,
18
+ hyena_order=2,
19
+ short_filter_order=3,
20
+ filter_order=64,
21
+ activation_freq=1,
22
+ embed_dropout=0.1,
23
+ hyena_dropout=0.0,
24
+ hyena_filter_dropout=0.0,
25
+ layer_norm_epsilon=1e-5,
26
+ initializer_range=0.02,
27
+ pad_vocab_size_multiple=8,
28
+ **kwargs,
29
+ ):
30
+ self.vocab_size = vocab_size
31
+ self.d_model = d_model
32
+ if d_inner is None:
33
+ self.d_inner = 4 * d_model
34
+ else:
35
+ self.d_inner = d_inner
36
+ self.use_bias = use_bias
37
+ self.train_freq = train_freq
38
+ self.max_seq_len = max_seq_len
39
+ self.emb_dim = emb_dim
40
+ self.n_layer = n_layer
41
+ self.hyena_order = hyena_order
42
+ self.filter_order = filter_order
43
+ self.short_filter_order = short_filter_order
44
+ self.activation_freq = activation_freq
45
+ self.num_inner_mlps = num_inner_mlps
46
+ self.embed_dropout = embed_dropout
47
+ self.hyena_dropout = hyena_dropout
48
+ self.hyena_filter_dropout = hyena_filter_dropout
49
+ self.layer_norm_epsilon = layer_norm_epsilon
50
+ self.initializer_range = initializer_range
51
+ self.pad_vocab_size_multiple = pad_vocab_size_multiple
52
+ super().__init__(**kwargs)
53
+
54
+ @classmethod
55
+ def from_original_config(cls, config_path, **kwargs):
56
+ with open(config_path, "r") as f:
57
+ config = json.load(f)
58
+
59
+ vocab_size = config["vocab_size"]
60
+ d_model = config["d_model"]
61
+ d_inner = config["d_inner"]
62
+ max_seq_len = config["layer"]["l_max"]
63
+ emb_dim = config["layer"]["emb_dim"]
64
+ filter_order = config["layer"]["filter_order"]
65
+ if "local_order" in config["layer"]:
66
+ short_filter_order = config["layer"]["local_order"]
67
+ elif "short_filter_order" in config["layer"]:
68
+ short_filter_order = config["layer"]["short_filter_order"]
69
+ else:
70
+ short_filter_order = 3
71
+ n_layer = config["n_layer"]
72
+ activation_freq = config["layer"]["w"]
73
+ embed_dropout = config["embed_dropout"]
74
+ pad_vocab_size_multiple = config["pad_vocab_size_multiple"]
75
+ return cls(vocab_size=vocab_size,
76
+ d_model=d_model,
77
+ d_inner=d_inner,
78
+ max_seq_len=max_seq_len,
79
+ emb_dim=emb_dim,
80
+ filter_order=filter_order,
81
+ short_filter_order=short_filter_order,
82
+ n_layer=n_layer,
83
+ activation_freq=activation_freq,
84
+ embed_dropout=embed_dropout,
85
+ pad_vocab_size_multiple=pad_vocab_size_multiple,
86
+ tie_word_embeddings=False,
87
+ **kwargs
88
+ )
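
HyenaConfig defaults d_inner to 4 * d_model when it is not supplied, and from_original_config flattens the nested original HyenaDNA config (layer.l_max, layer.w, ...) into these flat fields. A quick sketch of the default behaviour, assuming configuration_hyena.py is importable as a module:

# Sketch: d_inner default behaviour of HyenaConfig as defined above.
from configuration_hyena import HyenaConfig

cfg = HyenaConfig(d_model=256)                 # d_inner omitted
assert cfg.d_inner == 4 * 256                  # falls back to 4 * d_model
cfg2 = HyenaConfig(d_model=256, d_inner=1024)  # explicit value, as in config.json here
assert cfg2.d_inner == 1024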
checkpoint-800/modeling_hyena.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """HyenaDNA custom code port to Hugging Face Hub"""
3
+
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import functional as F
8
+ from .configuration_hyena import HyenaConfig
9
+ from transformers import PreTrainedModel
10
+ from typing import Optional, Tuple, Union
11
+ from transformers.modeling_outputs import CausalLMOutput, SequenceClassifierOutput, BaseModelOutputWithNoAttention
12
+
13
+
14
+ def fftconv(u, k, D):
15
+ """
16
+ We apply a convolution through the fourier domain (from the Convolution Theorem)
17
+
18
+ """
19
+ seqlen = u.shape[-1]
20
+ fft_size = 2 * seqlen
21
+
22
+ k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
23
+ u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
24
+
25
+ if len(u.shape) > 3: k_f = k_f.unsqueeze(1)
26
+ y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen]
27
+
28
+ out = y + u * D.unsqueeze(-1)
29
+ return out.to(dtype=u.dtype)
30
+
31
+
32
+ @torch.jit.script
33
+ def mul_sum(q, y):
34
+ return (q * y).sum(dim=1)
35
+
36
+
37
+ class HyenaSin(nn.Module):
38
+ """The Sin activation function for the Hyena Filter function."""
39
+ def __init__(self, config):
40
+ super().__init__()
41
+ self.freq = nn.Parameter(config.activation_freq * torch.ones(1, config.filter_order)) if config.train_freq else config.activation_freq * torch.ones(1, config.filter_order)
42
+
43
+ def forward(self, x):
44
+ return torch.sin(self.freq * x)
45
+
46
+
47
+ class HyenaPositionalEmbedding(nn.Module):
48
+ def __init__(self, config):
49
+ """Complex exponential positional embeddings for Hyena filters."""
50
+ super().__init__()
51
+
52
+ self.seq_len = config.max_seq_len
53
+ # The time embedding fed to the filters is normalized so that t_f = 1
54
+ t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1
55
+
56
+ if config.emb_dim > 1:
57
+ bands = (config.emb_dim - 1) // 2
58
+ # To compute the right embeddings we use the "proper" linspace
59
+ t_rescaled = torch.linspace(0, self.seq_len - 1, self.seq_len)[None, :, None]
60
+ w = 2 * math.pi * t_rescaled / self.seq_len # 1, L, 1
61
+
62
+ f = torch.linspace(1e-4, bands - 1, bands)[None, None]
63
+
64
+ z = torch.cat([t, torch.cos(-f * w), torch.sin(-f * w)], dim=-1)
65
+
66
+ self.register_buffer("z", z)
67
+ self.register_buffer("t", t)
68
+
69
+ def forward(self, L):
70
+ return self.z[:, :L], self.t[:, :L]
71
+
72
+
73
+ class HyenaExponentialModulation(nn.Module):
74
+ """The window function applied to the output of the (MLP) filter function."""
75
+ def __init__(
76
+ self,
77
+ d_model,
78
+ fast_decay_pct=0.3,
79
+ slow_decay_pct=1.5,
80
+ target=1e-2,
81
+ modulate: bool=True,
82
+ shift: float = 0.05,
83
+ **kwargs
84
+ ):
85
+ super().__init__()
86
+ self.modulate = modulate
87
+ self.shift = shift
88
+ max_decay = math.log(target) / fast_decay_pct
89
+ min_decay = math.log(target) / slow_decay_pct
90
+ deltas = torch.linspace(min_decay, max_decay, d_model)[None, None]
91
+ self.register_buffer("deltas", deltas)
92
+
93
+ def forward(self, t, x):
94
+ if self.modulate:
95
+ decay = torch.exp(-t * self.deltas.abs())
96
+ x = x * (decay + self.shift)
97
+ return x
98
+
99
+
100
+ class HyenaFilter(nn.Module):
101
+ def __init__(
102
+ self,
103
+ config,
104
+ **kwargs
105
+ ):
106
+ """
107
+ Implicit long filter with modulation.
108
+
109
+ Args:
110
+ d_model: number of channels in the input
111
+ emb_dim: dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands
112
+ order: width of the FFN
113
+ num_inner_mlps: number of inner linear layers inside filter MLP
114
+
115
+ Note:
116
+ filter_dropout is not implemented
117
+ """
118
+ super().__init__()
119
+
120
+ self.d_model = config.d_model * (config.hyena_order - 1)
121
+ self.use_bias = config.use_bias
122
+ self.bias = nn.Parameter(torch.randn(self.d_model))
123
+ self.dropout = nn.Dropout(config.hyena_filter_dropout)
124
+
125
+ act = HyenaSin(config)
126
+ self.emb_dim = config.emb_dim
127
+ assert self.emb_dim % 2 != 0 and self.emb_dim >= 3, "emb_dim must be odd and greater or equal to 3 (time, sine and cosine)"
128
+ self.seq_len = config.max_seq_len
129
+
130
+ self.pos_emb = HyenaPositionalEmbedding(config)
131
+
132
+ self.implicit_filter = nn.Sequential(
133
+ nn.Linear(self.emb_dim, config.filter_order),
134
+ act,
135
+ )
136
+ for i in range(config.num_inner_mlps):
137
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.filter_order))
138
+ self.implicit_filter.append(act)
139
+
140
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.d_model, bias=False))
141
+
142
+ self.modulation = HyenaExponentialModulation(config.d_model)
143
+
144
+ self.normalized = False
145
+
146
+ def filter(self, L, *args, **kwargs):
147
+ z, t = self.pos_emb(L)
148
+ h = self.implicit_filter(z.to(dtype=self.implicit_filter[0].weight.dtype))
149
+ h = self.modulation(t, h)
150
+ return h
151
+
152
+ def forward(self, x, L, k=None, bias=None, *args, **kwargs):
153
+ if k is None: k = self.filter(L)
154
+
155
+ # Ensure compatibility with filters that return a tuple
156
+ k = k[0] if type(k) is tuple else k
157
+
158
+ y = fftconv(x, k, bias)
159
+ return y
160
+
161
+
162
+ class HyenaOperator(nn.Module):
163
+ def __init__(
164
+ self,
165
+ config,
166
+ **filter_args,
167
+ ):
168
+ r"""
169
+ Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf
170
+
171
+ Args:
172
+ d_model (int): Dimension of the input and output embeddings (width of the layer)
173
+ l_max: (int): Maximum input sequence length. Defaults to None
174
+ order: (int): Depth of the Hyena recurrence. Defaults to 2
175
+ dropout: (float): Dropout probability. Defaults to 0.0
176
+ filter_dropout: (float): Dropout probability for the filter. Defaults to 0.0
177
+ """
178
+ super().__init__()
179
+
180
+ self.d_model = config.d_model
181
+ self.l_max = config.max_seq_len
182
+ self.order = config.hyena_order
183
+ inner_width = config.d_model * (self.order + 1)
184
+ self.dropout = nn.Dropout(config.hyena_dropout)
185
+ self.in_proj = nn.Linear(self.d_model, inner_width)
186
+ self.out_proj = nn.Linear(self.d_model, self.d_model)
187
+
188
+ self.short_filter = nn.Conv1d(
189
+ inner_width,
190
+ inner_width,
191
+ config.short_filter_order,
192
+ padding=2,
193
+ groups=inner_width
194
+ )
195
+ self.filter_fn = HyenaFilter(config)
196
+
197
+ def forward(self, u):
198
+ l = u.size(-2)
199
+ l_filter = min(l, self.l_max)
200
+ u = self.in_proj(u).transpose(1, 2)
201
+
202
+ uc = self.short_filter(u)[...,:l_filter]
203
+ *x, v = uc.split(self.d_model, dim=1)
204
+
205
+ k = self.filter_fn.filter(l_filter)[0]
206
+ k = k.transpose(0, 1).reshape(self.order - 1, self.d_model, l_filter)
207
+ bias = self.filter_fn.bias.reshape(self.order - 1, self.d_model)
208
+
209
+ for o, x_i in enumerate(reversed(x[1:])):
210
+ v = self.dropout(v * x_i)
211
+ v = self.filter_fn(v, l_filter, k=k[o], bias=bias[o])
212
+
213
+ y = (v * x[0]).transpose(1, 2)
214
+
215
+ y = self.out_proj(y)
216
+ return y
217
+
218
+ class HyenaMlp(nn.Module):
219
+
220
+ def __init__(self, config):
221
+ """
222
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/mlp.py
223
+ """
224
+ super().__init__()
225
+ in_features = config.d_model
226
+ hidden_features = config.d_inner
227
+ self.fc1 = nn.Linear(in_features, hidden_features)
228
+ self.fc2 = nn.Linear(hidden_features, config.d_model)
229
+
230
+ def forward(self, x):
231
+ y = self.fc1(x)
232
+ y = F.gelu(y, approximate="tanh")
233
+ y = self.fc2(y)
234
+ return y
235
+
236
+ class HyenaBlock(nn.Module):
237
+
238
+ def __init__(self, config):
239
+ """
240
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/block.py
241
+ For prenorm=True, this Block has a slightly different structure compared to a regular
242
+ prenorm Transformer block.
243
+ The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
244
+ [Ref: https://arxiv.org/abs/2002.04745]
245
+ Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both
246
+ the hidden_states (output of the MLP) and the residual.
247
+ This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
248
+ The residual needs to be provided (except for the very first block).
249
+ For prenorm=False, this Block has the same structure as a regular postnorm Transformer
250
+ block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN.
251
+ return_residual: whether each of the sub-layers (mixer and mlp) will return the residual.
252
+ This is for performance reason: for post-norm architecture, returning the input allows us
253
+ to fuse the backward of nn.Linear with the residual connection.
254
+ """
255
+ super().__init__()
256
+ self.mixer = HyenaOperator(config)
257
+ self.norm1 = nn.LayerNorm(config.d_model)
258
+ self.mlp = HyenaMlp(config)
259
+ self.norm2 = nn.LayerNorm(config.d_model)
260
+
261
+ def forward(self, hidden_states):
262
+ r"""Pass the input through the encoder layer.
263
+ Args:
264
+ hidden_states: the sequence to the encoder layer (required).
265
+ residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
266
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
267
+ before applying the query projection. Useful for e.g., ViT where we only care
268
+ about the CLS token in the last layer.
269
+ """
270
+ residual = hidden_states
271
+ residual = residual.to(torch.float32)
272
+ hyena_normed = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
273
+ hidden_states = self.mixer(hyena_normed)
274
+ # Tested above here and all is equivalent. That means the mixer is fine!!!
275
+ residual = hidden_states + residual
276
+ hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
277
+ residual = residual.to(torch.float32)
278
+
279
+ hidden_states = self.mlp(hidden_states)
280
+ return hidden_states + residual
281
+
282
+
283
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
284
+
285
+
286
+ class HyenaEmbeddings(nn.Module):
287
+
288
+ def __init__(self, config, padding_idx=None):
289
+ """
290
+ If max_position_embeddings <= 0, there's no position embeddings
291
+ If word_embe_proj_dim is not None (e.g., OPT-350m), we embed to that dimension
292
+ the project up to embed_dim
293
+ """
294
+ super().__init__()
295
+ vocab_size = config.vocab_size
296
+ if vocab_size % config.pad_vocab_size_multiple != 0:
297
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
298
+ self.word_embeddings = nn.Embedding(vocab_size, config.d_model, padding_idx=padding_idx)
299
+
300
+ def forward(self, input_ids):
301
+ """
302
+ input_ids: (batch, seqlen)
303
+ """
304
+ embeddings = self.word_embeddings(input_ids)
305
+ return embeddings
306
+
307
+ class HyenaLMBackbone(nn.Module):
308
+
309
+ def __init__(self, config) -> None:
310
+ super().__init__()
311
+ # note max_position_embeddings is 0 for Hyena, and therefore isn't used
312
+ self.embeddings = HyenaEmbeddings(config)
313
+ self.dropout = nn.Dropout(config.embed_dropout)
314
+
315
+ self.layers = nn.ModuleList([HyenaBlock(config) for i in range(config.n_layer)])
316
+
317
+ self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
318
+ self.gradient_checkpointing = False
319
+
320
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
321
+ all_hidden_states = []
322
+ if inputs_embeds is not None:
323
+ hidden_states = inputs_embeds
324
+ else:
325
+ hidden_states = self.embeddings(input_ids)
326
+ if output_hidden_states:
327
+ all_hidden_states.append(hidden_states)
328
+
329
+ for layer in self.layers:
330
+ if self.gradient_checkpointing and self.training:
331
+ hidden_states = self._gradient_checkpointing_func(layer.__call__, hidden_states)
332
+ else:
333
+ hidden_states = layer(hidden_states)
334
+ if output_hidden_states:
335
+ all_hidden_states.append(hidden_states)
336
+
337
+ hidden_states = self.ln_f(hidden_states.to(dtype=self.ln_f.weight.dtype))
338
+ if output_hidden_states:
339
+ all_hidden_states.append(hidden_states)
340
+
341
+ return hidden_states, all_hidden_states
342
+
343
+
344
+ class HyenaDNAPreTrainedModel(PreTrainedModel):
345
+ config_class = HyenaConfig
346
+ base_model_prefix = "hyena"
347
+ supports_gradient_checkpointing = True
348
+ _no_split_modules = ["HyenaBlock"]
349
+ _skip_keys_device_placement = "past_key_values"
350
+ _keys_to_ignore_on_load_missing = [r"freq"] # Shared tensors that safetensors merges
351
+
352
+ def _init_weights(self, module, initializer_range=0.02):
353
+ if isinstance(module, nn.Linear):
354
+ nn.init.normal_(module.weight, std=initializer_range)
355
+ if module.bias is not None:
356
+ nn.init.zeros_(module.bias)
357
+ elif isinstance(module, nn.Embedding):
358
+ nn.init.normal_(module.weight, std=initializer_range)
359
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
360
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
361
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
362
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
363
+ #
364
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
365
+ for name, p in self.named_parameters():
366
+ if name in ["out_proj.weight", "fc2.weight"]:
367
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
368
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
369
+ # If using GLU activation for now, we scale the std by 2
370
+ elif name in ["output_linear.0.weight"]:
371
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
372
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.num_layers))
373
+
374
+
375
+ class HyenaDNAModel(HyenaDNAPreTrainedModel):
376
+ def __init__(self, config, **kwargs) -> None:
377
+ super().__init__(config, **kwargs)
378
+
379
+ self.backbone = HyenaLMBackbone(config)
380
+ self.config = config
381
+
382
+ # Initialize weights and apply final processing
383
+ self.post_init()
384
+
385
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=None, return_dict=None):
386
+ output_hidden_states = (
387
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
388
+ )
389
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
390
+
391
+ hidden_states, all_hidden_states = self.backbone(input_ids, inputs_embeds=inputs_embeds, output_hidden_states=output_hidden_states)
392
+ if return_dict:
393
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states,
394
+ hidden_states=all_hidden_states if output_hidden_states else None)
395
+ elif output_hidden_states:
396
+ return hidden_states, all_hidden_states
397
+ else:
398
+ return hidden_states
399
+
400
+
401
+ class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
402
+
403
+ def __init__(self, config, **kwargs):
404
+ super().__init__(config, **kwargs)
405
+ self.hyena = HyenaDNAModel(config)
406
+ vocab_size = config.vocab_size
407
+ if vocab_size % config.pad_vocab_size_multiple != 0:
408
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
409
+ self.vocab_size = vocab_size
410
+ self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)
411
+
412
+ # Initialize weights and apply final processing
413
+ self.post_init()
414
+
415
+ def get_input_embeddings(self):
416
+ return self.hyena.backbone.embeddings.word_embeddings
417
+
418
+ def set_input_embeddings(self, value):
419
+ self.hyena.backbone.embeddings.word_embeddings = value
420
+
421
+ def get_output_embeddings(self):
422
+ return self.lm_head
423
+
424
+ def set_output_embeddings(self, new_embeddings):
425
+ self.lm_head = new_embeddings
426
+
427
+ def set_decoder(self, decoder):
428
+ self.hyena = decoder
429
+
430
+ def get_decoder(self):
431
+ return self.hyena
432
+
433
+ def forward(
434
+ self,
435
+ input_ids: torch.LongTensor = None,
436
+ inputs_embeds: Optional[torch.FloatTensor] = None,
437
+ labels: Optional[torch.LongTensor] = None,
438
+ output_hidden_states: Optional[bool] = None,
439
+ return_dict: Optional[bool] = None,
440
+ ) -> Union[Tuple, CausalLMOutput]:
441
+
442
+ output_hidden_states = (
443
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
444
+ )
445
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
446
+
447
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
448
+ outputs = self.hyena(
449
+ input_ids=input_ids,
450
+ inputs_embeds=inputs_embeds,
451
+ output_hidden_states=output_hidden_states,
452
+ return_dict=return_dict,
453
+ )
454
+
455
+ hidden_states = outputs[0]
456
+ logits = self.lm_head(hidden_states)
457
+ logits = logits.float()
458
+
459
+ loss = None
460
+ if labels is not None:
461
+ # Shift so that tokens < n predict n
462
+ shift_logits = logits[..., :-1, :].contiguous()
463
+ shift_labels = labels[..., 1:].contiguous()
464
+ # Flatten the tokens
465
+ loss_fct = nn.CrossEntropyLoss()
466
+ shift_logits = shift_logits.view(-1, self.vocab_size)
467
+ shift_labels = shift_labels.view(-1)
468
+ # Enable model parallelism
469
+ shift_labels = shift_labels.to(shift_logits.device)
470
+ loss = loss_fct(shift_logits, shift_labels)
471
+
472
+ if not return_dict:
473
+ output = (logits,) + outputs[1:]
474
+ return (loss,) + output if loss is not None else output
475
+
476
+ return CausalLMOutput(
477
+ loss=loss,
478
+ logits=logits,
479
+ hidden_states=outputs.hidden_states,
480
+ )
481
+
482
+
483
+ class HyenaDNAForSequenceClassification(HyenaDNAPreTrainedModel):
484
+ def __init__(self, config, **kwargs):
485
+ super().__init__(config, **kwargs)
486
+ self.num_labels = kwargs.get("num_labels", config.num_labels)
487
+ self.hyena = HyenaDNAModel(config)
488
+ self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
489
+
490
+ # Initialize weights and apply final processing
491
+ self.post_init()
492
+
493
+ def get_input_embeddings(self):
494
+ return self.hyena.backbone.embeddings.word_embeddings
495
+
496
+ def set_input_embeddings(self, value):
497
+ self.hyena.backbone.embeddings.word_embeddings = value
498
+
499
+ def forward(
500
+ self,
501
+ input_ids: torch.LongTensor = None,
502
+ inputs_embeds: Optional[torch.FloatTensor] = None,
503
+ labels: Optional[torch.LongTensor] = None,
504
+ output_hidden_states: Optional[bool] = None,
505
+ return_dict: Optional[bool] = None,
506
+ ) -> Union[Tuple, SequenceClassifierOutput]:
507
+ r"""
508
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
509
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
510
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
511
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
512
+ """
513
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
514
+
515
+ transformer_outputs = self.hyena(
516
+ input_ids,
517
+ inputs_embeds=inputs_embeds,
518
+ output_hidden_states=output_hidden_states,
519
+ return_dict=return_dict,
520
+ )
521
+ hidden_states = transformer_outputs[0]
522
+ logits = self.score(hidden_states)
523
+
524
+ if input_ids is not None:
525
+ batch_size = input_ids.shape[0]
526
+ else:
527
+ batch_size = inputs_embeds.shape[0]
528
+
529
+ if self.config.pad_token_id is None and batch_size != 1:
530
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
531
+ if self.config.pad_token_id is None:
532
+ sequence_lengths = -1
533
+ else:
534
+ if input_ids is not None:
535
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
536
+ logits.device
537
+ )
538
+ else:
539
+ sequence_lengths = -1
540
+
541
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
542
+
543
+ loss = None
544
+ if labels is not None:
545
+ labels = labels.to(logits.device)
546
+ if self.config.problem_type is None:
547
+ if self.num_labels == 1:
548
+ self.config.problem_type = "regression"
549
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
550
+ self.config.problem_type = "single_label_classification"
551
+ else:
552
+ self.config.problem_type = "multi_label_classification"
553
+
554
+ if self.config.problem_type == "regression":
555
+ loss_fct = nn.MSELoss()
556
+ if self.num_labels == 1:
557
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
558
+ else:
559
+ loss = loss_fct(pooled_logits, labels)
560
+ elif self.config.problem_type == "single_label_classification":
561
+ loss_fct = nn.CrossEntropyLoss()
562
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
563
+ elif self.config.problem_type == "multi_label_classification":
564
+ loss_fct = nn.BCEWithLogitsLoss()
565
+ loss = loss_fct(pooled_logits, labels)
566
+ if not return_dict:
567
+ output = (pooled_logits,) + transformer_outputs[1:]
568
+ return ((loss,) + output) if loss is not None else output
569
+
570
+ return SequenceClassifierOutput(
571
+ loss=loss,
572
+ logits=pooled_logits,
573
+ hidden_states=transformer_outputs.hidden_states,
574
+ )
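
The core of the operator above is fftconv: the input and filter are zero-padded to twice the sequence length, so the pointwise product in frequency space reduces to an ordinary causal (linear) convolution plus the u * D skip term. A small self-contained numerical sketch of that equivalence (shapes chosen arbitrarily, nothing imported from the file above):

# Sketch: the FFT path used by fftconv equals a direct causal convolution + skip term.
import torch

torch.manual_seed(0)
B, H, L = 2, 4, 16
u = torch.randn(B, H, L)   # (batch, channels, length)
k = torch.randn(H, L)      # implicit long filter
D = torch.randn(H)         # skip/bias term

# FFT path, mirroring fftconv: zero-pad to 2L to avoid circular wrap-around
fft_size = 2 * L
k_f = torch.fft.rfft(k, n=fft_size) / fft_size
u_f = torch.fft.rfft(u, n=fft_size)
y_fft = torch.fft.irfft(u_f * k_f, n=fft_size, norm="forward")[..., :L] + u * D.unsqueeze(-1)

# Direct causal convolution: y[t] = sum_{s <= t} u[s] * k[t - s], plus the same skip term
y_direct = torch.zeros_like(u)
for t in range(L):
    for s in range(t + 1):
        y_direct[..., t] += u[..., s] * k[..., t - s]
y_direct = y_direct + u * D.unsqueeze(-1)

assert torch.allclose(y_fft, y_direct, atol=1e-4)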
checkpoint-800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8763978a2444824f5a38c3580aba497e275fdda9db740c2f56f67645d5be8636
3
+ size 26304517
checkpoint-800/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299e2c7579e7fa27e18571ea2c3b4590da8d13bf1e459e0bb5e600e6d1482acd
3
+ size 16300157
checkpoint-800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ae46c9fc2e6261b72f159e1aea97f31830fdd07cc60689547b223b43e934178
3
+ size 14575
checkpoint-800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:639595adfe39430588607a0328b8b87bd7572031124a42ca29c480181cdf81a1
3
+ size 557
checkpoint-800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dac5d5b84a5c940b5d42dcc0f70b744b0e74ac1348849e3191a87cec9e5c4661
3
+ size 627
checkpoint-800/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-800/tokenization_hyena.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer, AddedToken
2
+ from typing import List, Optional, Union, Dict, Sequence, Tuple
3
+ from pathlib import Path
4
+ import json
5
+ import os
6
+
7
+
8
+ class HyenaDNATokenizer(PreTrainedTokenizer):
9
+ model_input_names = ["input_ids"]
10
+
11
+ def __init__(self,
12
+ model_max_length: int,
13
+ bos_token="[BOS]",
14
+ eos_token="[SEP]",
15
+ sep_token="[SEP]",
16
+ cls_token="[CLS]",
17
+ pad_token="[PAD]",
18
+ mask_token="[MASK]",
19
+ unk_token="[UNK]",
20
+ **kwargs):
21
+ """Character tokenizer for Hugging Face transformers.
22
+ Args:
23
+ characters (Sequence[str]): List of desired characters. Any character which
24
+ is not included in this list will be replaced by a special token called
25
+ [UNK] with id=6. Following are list of all of the special tokens with
26
+ their corresponding ids:
27
+ "[CLS]": 0
28
+ "[SEP]": 1
29
+ "[BOS]": 2
30
+ "[MASK]": 3
31
+ "[PAD]": 4
32
+ "[RESERVED]": 5
33
+ "[UNK]": 6
34
+ an id (starting at 7) will be assigned to each character.
35
+ model_max_length (int): Model maximum sequence length.
36
+ """
37
+ self.characters = ('A', 'C', 'G', 'T', 'N')
38
+ self.model_max_length = model_max_length
39
+
40
+ self._vocab_str_to_int = {
41
+ "[CLS]": 0,
42
+ "[SEP]": 1,
43
+ "[BOS]": 2,
44
+ "[MASK]": 3,
45
+ "[PAD]": 4,
46
+ "[RESERVED]": 5,
47
+ "[UNK]": 6,
48
+ **{ch: i + 7 for i, ch in enumerate(self.characters)},
49
+ }
50
+ self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
51
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
52
+ padding_side = kwargs.pop("padding_side", "left")
53
+
54
+ super().__init__(
55
+ bos_token=bos_token,
56
+ eos_token=eos_token,
57
+ sep_token=sep_token,
58
+ cls_token=cls_token,
59
+ pad_token=pad_token,
60
+ mask_token=mask_token,
61
+ unk_token=unk_token,
62
+ add_prefix_space=add_prefix_space,
63
+ model_max_length=model_max_length,
64
+ padding_side=padding_side,
65
+ **kwargs,
66
+ )
67
+
68
+ @property
69
+ def vocab_size(self) -> int:
70
+ return len(self._vocab_str_to_int)
71
+
72
+ def _tokenize(self, text: str) -> List[str]:
73
+ return list(text)
74
+
75
+ def _convert_token_to_id(self, token: str) -> int:
76
+ return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
77
+
78
+ def _convert_id_to_token(self, index: int) -> str:
79
+ return self._vocab_int_to_str[index]
80
+
81
+ def convert_tokens_to_string(self, tokens):
82
+ return "".join(tokens)
83
+
84
+ def get_special_tokens_mask(
85
+ self,
86
+ token_ids_0: List[int],
87
+ token_ids_1: Optional[List[int]] = None,
88
+ already_has_special_tokens: bool = False,
89
+ ) -> List[int]:
90
+ if already_has_special_tokens:
91
+ return super().get_special_tokens_mask(
92
+ token_ids_0=token_ids_0,
93
+ token_ids_1=token_ids_1,
94
+ already_has_special_tokens=True,
95
+ )
96
+
97
+ result = ([0] * len(token_ids_0)) + [1]
98
+ if token_ids_1 is not None:
99
+ result += ([0] * len(token_ids_1)) + [1]
100
+ return result
101
+
102
+ def build_inputs_with_special_tokens(
103
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
104
+ ) -> List[int]:
105
+ sep = [self.sep_token_id]
106
+ # cls = [self.cls_token_id]
107
+ result = token_ids_0 + sep
108
+ if token_ids_1 is not None:
109
+ result += token_ids_1 + sep
110
+ return result
111
+
112
+ def get_vocab(self) -> Dict[str, int]:
113
+ return self._vocab_str_to_int
114
+
115
+ # HyenaDNA has a fixed vocabulary with no vocab file
116
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
117
+ return ()
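
Because model_input_names only contains input_ids, batched encoding pads with [PAD] (id 4) on the configured side but returns no attention mask; the classification head instead locates the last non-padding token via pad_token_id. A short sketch, again assuming the file above is importable and using padding_side="right" as in tokenizer_config.json:

# Sketch: batch padding behaviour of the character tokenizer above.
from tokenization_hyena import HyenaDNATokenizer

tok = HyenaDNATokenizer(model_max_length=32770, padding_side="right")
batch = tok(["ACGT", "AC"], padding=True)
# Each sequence gets a trailing [SEP] (id 1); the shorter one is right-padded with [PAD] (id 4):
assert batch["input_ids"] == [[7, 8, 9, 10, 1], [7, 8, 1, 4, 4]]
assert "attention_mask" not in batch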
checkpoint-800/tokenizer_config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "[CLS]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "[SEP]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "[BOS]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "[MASK]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "6": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ }
52
+ },
53
+ "auto_map": {
54
+ "AutoTokenizer": [
55
+ "tokenization_hyena.HyenaDNATokenizer",
56
+ null
57
+ ]
58
+ },
59
+ "bos_token": "[BOS]",
60
+ "clean_up_tokenization_spaces": true,
61
+ "cls_token": "[CLS]",
62
+ "eos_token": "[SEP]",
63
+ "mask_token": "[MASK]",
64
+ "model_max_length": 256,
65
+ "name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
66
+ "pad_token": "[PAD]",
67
+ "padding_side": "right",
68
+ "sep_token": "[SEP]",
69
+ "special_tokens_map_file": "/home/hlv8980/.cache/huggingface/hub/models--LongSafari--hyenadna-small-32k-seqlen-hf/snapshots/8fe770c78eb13fe33bf81501612faeddf4d6f331/special_tokens_map.json",
70
+ "tokenizer_class": "HyenaDNATokenizer",
71
+ "unk_token": "[UNK]"
72
+ }
checkpoint-800/trainer_state.json ADDED
@@ -0,0 +1,116 @@
1
+ {
2
+ "best_metric": 0.39216598868370056,
3
+ "best_model_checkpoint": "/scratch/hlv8980/Attack_Benchmark/models/hyena/tf4/origin/checkpoint-600",
4
+ "epoch": 2.6936026936026938,
5
+ "global_step": 800,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.34,
12
+ "learning_rate": 2.8760984182776802e-05,
13
+ "loss": 0.5992,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.67,
18
+ "learning_rate": 2.615114235500879e-05,
19
+ "loss": 0.4813,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.67,
24
+ "eval_accuracy": 0.774,
25
+ "eval_f1": 0.7713328260834371,
26
+ "eval_loss": 0.48207539319992065,
27
+ "eval_matthews_correlation": 0.5579338694412199,
28
+ "eval_precision": 0.785067107786007,
29
+ "eval_recall": 0.772997299729973,
30
+ "eval_runtime": 0.1057,
31
+ "eval_samples_per_second": 9462.679,
32
+ "eval_steps_per_second": 151.403,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 1.01,
37
+ "learning_rate": 2.3514938488576452e-05,
38
+ "loss": 0.4431,
39
+ "step": 300
40
+ },
41
+ {
42
+ "epoch": 1.35,
43
+ "learning_rate": 2.087873462214411e-05,
44
+ "loss": 0.377,
45
+ "step": 400
46
+ },
47
+ {
48
+ "epoch": 1.35,
49
+ "eval_accuracy": 0.816,
50
+ "eval_f1": 0.8159933757615274,
51
+ "eval_loss": 0.427643358707428,
52
+ "eval_matthews_correlation": 0.6320128653971173,
53
+ "eval_precision": 0.815991263965056,
54
+ "eval_recall": 0.816021602160216,
55
+ "eval_runtime": 0.1039,
56
+ "eval_samples_per_second": 9625.637,
57
+ "eval_steps_per_second": 154.01,
58
+ "step": 400
59
+ },
60
+ {
61
+ "epoch": 1.68,
62
+ "learning_rate": 1.82688927943761e-05,
63
+ "loss": 0.3443,
64
+ "step": 500
65
+ },
66
+ {
67
+ "epoch": 2.02,
68
+ "learning_rate": 1.563268892794376e-05,
69
+ "loss": 0.33,
70
+ "step": 600
71
+ },
72
+ {
73
+ "epoch": 2.02,
74
+ "eval_accuracy": 0.824,
75
+ "eval_f1": 0.8239746523499383,
76
+ "eval_loss": 0.39216598868370056,
77
+ "eval_matthews_correlation": 0.6479558982194922,
78
+ "eval_precision": 0.8239935027265344,
79
+ "eval_recall": 0.8239623962396239,
80
+ "eval_runtime": 0.1031,
81
+ "eval_samples_per_second": 9696.512,
82
+ "eval_steps_per_second": 155.144,
83
+ "step": 600
84
+ },
85
+ {
86
+ "epoch": 2.36,
87
+ "learning_rate": 1.2996485061511423e-05,
88
+ "loss": 0.227,
89
+ "step": 700
90
+ },
91
+ {
92
+ "epoch": 2.69,
93
+ "learning_rate": 1.0360281195079087e-05,
94
+ "loss": 0.2219,
95
+ "step": 800
96
+ },
97
+ {
98
+ "epoch": 2.69,
99
+ "eval_accuracy": 0.838,
100
+ "eval_f1": 0.8379766686402841,
101
+ "eval_loss": 0.4026987850666046,
102
+ "eval_matthews_correlation": 0.6767651028795362,
103
+ "eval_precision": 0.8385613769517563,
104
+ "eval_recall": 0.8382038203820381,
105
+ "eval_runtime": 0.1026,
106
+ "eval_samples_per_second": 9746.534,
107
+ "eval_steps_per_second": 155.945,
108
+ "step": 800
109
+ }
110
+ ],
111
+ "max_steps": 1188,
112
+ "num_train_epochs": 4,
113
+ "total_flos": 102556265398272.0,
114
+ "trial_name": null,
115
+ "trial_params": null
116
+ }
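As a quick consistency check on the fields above: with max_steps 1188 spread over num_train_epochs 4, one epoch is 297 optimizer steps, so global_step 800 lands at 800 / 297 ≈ 2.6936, which is exactly the "epoch" value recorded for this checkpoint.

# Sanity check of the fractional "epoch" field (plain Python, no dependencies).
max_steps, num_train_epochs, global_step = 1188, 4, 800
steps_per_epoch = max_steps / num_train_epochs   # 297.0
print(global_step / steps_per_epoch)             # 2.6936..., matches the logged epoch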
checkpoint-800/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4e3a85efd6ca1a2228fc0bf6f5ca43150a2981352327acbf61aa5be7e43d49
3
+ size 3707
config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "_name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
3
+ "activation_freq": 10,
4
+ "architectures": [
5
+ "HyenaDNAForSequenceClassification"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_hyena.HyenaConfig",
9
+ "AutoModel": "modeling_hyena.HyenaDNAModel",
10
+ "AutoModelForCausalLM": "modeling_hyena.HyenaDNAForCausalLM",
11
+ "AutoModelForSequenceClassification": "modeling_hyena.HyenaDNAForSequenceClassification"
12
+ },
13
+ "d_inner": 1024,
14
+ "d_model": 256,
15
+ "emb_dim": 5,
16
+ "embed_dropout": 0.1,
17
+ "filter_order": 64,
18
+ "hyena_dropout": 0.0,
19
+ "hyena_filter_dropout": 0.0,
20
+ "hyena_order": 2,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_epsilon": 1e-05,
23
+ "max_seq_len": 32770,
24
+ "model_type": "hyenadna",
25
+ "n_layer": 4,
26
+ "num_inner_mlps": 2,
27
+ "pad_token_id": 4,
28
+ "pad_vocab_size_multiple": 8,
29
+ "problem_type": "single_label_classification",
30
+ "short_filter_order": 3,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "float32",
33
+ "train_freq": true,
34
+ "transformers_version": "4.26.1",
35
+ "use_bias": true,
36
+ "vocab_size": 12
37
+ }
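Since auto_map in this config routes the Auto* classes to the modeling_hyena.py and configuration_hyena.py files uploaded here, the fine-tuned classifier is loaded with trust_remote_code. A minimal sketch, where MODEL_DIR stands in for the root of this upload (the name is illustrative):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_DIR = "."  # illustrative: point this at the root of this upload

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, trust_remote_code=True)
model.eval()

inputs = tokenizer("ACGTACGTACGT", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, num_labels)
print(logits.argmax(-1))

No attention mask is needed: the classification head in modeling_hyena.py pools the logit at the last token before any [PAD] padding.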
configuration_hyena.py ADDED
@@ -0,0 +1,88 @@
1
+ from transformers import PretrainedConfig
2
+ import json
3
+
4
+
5
+ class HyenaConfig(PretrainedConfig):
6
+ model_type = "hyenadna"
7
+ def __init__(
8
+ self,
9
+ vocab_size=12,
10
+ d_model=256,
11
+ d_inner=None,
12
+ use_bias=True,
13
+ train_freq=True,
14
+ max_seq_len=1024,
15
+ emb_dim=3,
16
+ n_layer=12,
17
+ num_inner_mlps=2,
18
+ hyena_order=2,
19
+ short_filter_order=3,
20
+ filter_order=64,
21
+ activation_freq=1,
22
+ embed_dropout=0.1,
23
+ hyena_dropout=0.0,
24
+ hyena_filter_dropout=0.0,
25
+ layer_norm_epsilon=1e-5,
26
+ initializer_range=0.02,
27
+ pad_vocab_size_multiple=8,
28
+ **kwargs,
29
+ ):
30
+ self.vocab_size = vocab_size
31
+ self.d_model = d_model
32
+ if d_inner is None:
33
+ self.d_inner = 4 * d_model
34
+ else:
35
+ self.d_inner = d_inner
36
+ self.use_bias = use_bias
37
+ self.train_freq = train_freq
38
+ self.max_seq_len = max_seq_len
39
+ self.emb_dim = emb_dim
40
+ self.n_layer = n_layer
41
+ self.hyena_order = hyena_order
42
+ self.filter_order = filter_order
43
+ self.short_filter_order = short_filter_order
44
+ self.activation_freq = activation_freq
45
+ self.num_inner_mlps = num_inner_mlps
46
+ self.embed_dropout = embed_dropout
47
+ self.hyena_dropout = hyena_dropout
48
+ self.hyena_filter_dropout = hyena_filter_dropout
49
+ self.layer_norm_epsilon = layer_norm_epsilon
50
+ self.initializer_range = initializer_range
51
+ self.pad_vocab_size_multiple = pad_vocab_size_multiple
52
+ super().__init__(**kwargs)
53
+
54
+ @classmethod
55
+ def from_original_config(cls, config_path, **kwargs):
56
+ with open(config_path, "r") as f:
57
+ config = json.load(f)
58
+
59
+ vocab_size = config["vocab_size"]
60
+ d_model = config["d_model"]
61
+ d_inner = config["d_inner"]
62
+ max_seq_len = config["layer"]["l_max"]
63
+ emb_dim = config["layer"]["emb_dim"]
64
+ filter_order = config["layer"]["filter_order"]
65
+ if "local_order" in config["layer"]:
66
+ short_filter_order = config["layer"]["local_order"]
67
+ elif "short_filter_order" in config["layer"]:
68
+ short_filter_order = config["layer"]["short_filter_order"]
69
+ else:
70
+ short_filter_order = 3
71
+ n_layer = config["n_layer"]
72
+ activation_freq = config["layer"]["w"]
73
+ embed_dropout = config["embed_dropout"]
74
+ pad_vocab_size_multiple = config["pad_vocab_size_multiple"]
75
+ return cls(vocab_size=vocab_size,
76
+ d_model=d_model,
77
+ d_inner=d_inner,
78
+ max_seq_len=max_seq_len,
79
+ emb_dim=emb_dim,
80
+ filter_order=filter_order,
81
+ short_filter_order=short_filter_order,
82
+ n_layer=n_layer,
83
+ activation_freq=activation_freq,
84
+ embed_dropout=embed_dropout,
85
+ pad_vocab_size_multiple=pad_vocab_size_multiple,
86
+ tie_word_embeddings=False,
87
+ **kwargs
88
+ )
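One default in the class above is easy to miss: when d_inner is not passed, it falls back to 4 * d_model, which is exactly the 256 → 1024 pairing seen in the uploaded config.json. A small sketch, assuming configuration_hyena.py is importable from the working directory:

from configuration_hyena import HyenaConfig

cfg = HyenaConfig(d_model=256)          # d_inner omitted
assert cfg.d_inner == 4 * cfg.d_model   # 1024, matching config.json

cfg_wide = HyenaConfig(d_model=256, d_inner=2048)
assert cfg_wide.d_inner == 2048         # an explicit value wins over the default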
modeling_hyena.py ADDED
@@ -0,0 +1,574 @@
1
+ # -*- coding: utf-8 -*-
2
+ """HyenaDNA custom code port to Hugging Face Hub"""
3
+
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import functional as F
8
+ from .configuration_hyena import HyenaConfig
9
+ from transformers import PreTrainedModel
10
+ from typing import Optional, Tuple, Union
11
+ from transformers.modeling_outputs import CausalLMOutput, SequenceClassifierOutput, BaseModelOutputWithNoAttention
12
+
13
+
14
+ def fftconv(u, k, D):
15
+ """
16
+ We apply a convolution through the Fourier domain (via the Convolution Theorem)
17
+
18
+ """
19
+ seqlen = u.shape[-1]
20
+ fft_size = 2 * seqlen
21
+
22
+ k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
23
+ u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
24
+
25
+ if len(u.shape) > 3: k_f = k_f.unsqueeze(1)
26
+ y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen]
27
+
28
+ out = y + u * D.unsqueeze(-1)
29
+ return out.to(dtype=u.dtype)
30
+
31
+
32
+ @torch.jit.script
33
+ def mul_sum(q, y):
34
+ return (q * y).sum(dim=1)
35
+
36
+
37
+ class HyenaSin(nn.Module):
38
+ """The Sin activation function for the Hyena Filter function."""
39
+ def __init__(self, config):
40
+ super().__init__()
41
+ self.freq = nn.Parameter(config.activation_freq * torch.ones(1, config.filter_order)) if config.train_freq else config.activation_freq * torch.ones(1, config.filter_order)
42
+
43
+ def forward(self, x):
44
+ return torch.sin(self.freq * x)
45
+
46
+
47
+ class HyenaPositionalEmbedding(nn.Module):
48
+ def __init__(self, config):
49
+ """Complex exponential positional embeddings for Hyena filters."""
50
+ super().__init__()
51
+
52
+ self.seq_len = config.max_seq_len
53
+ # The time embedding fed to the filters is normalized so that t_f = 1
54
+ t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1
55
+
56
+ if config.emb_dim > 1:
57
+ bands = (config.emb_dim - 1) // 2
58
+ # To compute the right embeddings we use the "proper" linspace
59
+ t_rescaled = torch.linspace(0, self.seq_len - 1, self.seq_len)[None, :, None]
60
+ w = 2 * math.pi * t_rescaled / self.seq_len # 1, L, 1
61
+
62
+ f = torch.linspace(1e-4, bands - 1, bands)[None, None]
63
+
64
+ z = torch.cat([t, torch.cos(-f * w), torch.sin(-f * w)], dim=-1)
65
+
66
+ self.register_buffer("z", z)
67
+ self.register_buffer("t", t)
68
+
69
+ def forward(self, L):
70
+ return self.z[:, :L], self.t[:, :L]
71
+
72
+
73
+ class HyenaExponentialModulation(nn.Module):
74
+ """The window function applied to the output of the (MLP) filter function."""
75
+ def __init__(
76
+ self,
77
+ d_model,
78
+ fast_decay_pct=0.3,
79
+ slow_decay_pct=1.5,
80
+ target=1e-2,
81
+ modulate: bool=True,
82
+ shift: float = 0.05,
83
+ **kwargs
84
+ ):
85
+ super().__init__()
86
+ self.modulate = modulate
87
+ self.shift = shift
88
+ max_decay = math.log(target) / fast_decay_pct
89
+ min_decay = math.log(target) / slow_decay_pct
90
+ deltas = torch.linspace(min_decay, max_decay, d_model)[None, None]
91
+ self.register_buffer("deltas", deltas)
92
+
93
+ def forward(self, t, x):
94
+ if self.modulate:
95
+ decay = torch.exp(-t * self.deltas.abs())
96
+ x = x * (decay + self.shift)
97
+ return x
98
+
99
+
100
+ class HyenaFilter(nn.Module):
101
+ def __init__(
102
+ self,
103
+ config,
104
+ **kwargs
105
+ ):
106
+ """
107
+ Implicit long filter with modulation.
108
+
109
+ Args:
110
+ d_model: number of channels in the input
111
+ emb_dim: dimension of the positional encoding; (`emb_dim` - 1) // 2 is the number of bands
112
+ order: width of the FFN
113
+ num_inner_mlps: number of inner linear layers inside filter MLP
114
+
115
+ Note:
116
+ filter_dropout is not implemented
117
+ """
118
+ super().__init__()
119
+
120
+ self.d_model = config.d_model * (config.hyena_order - 1)
121
+ self.use_bias = config.use_bias
122
+ self.bias = nn.Parameter(torch.randn(self.d_model))
123
+ self.dropout = nn.Dropout(config.hyena_filter_dropout)
124
+
125
+ act = HyenaSin(config)
126
+ self.emb_dim = config.emb_dim
127
+ assert self.emb_dim % 2 != 0 and self.emb_dim >= 3, "emb_dim must be odd and greater than or equal to 3 (time, sine and cosine)"
128
+ self.seq_len = config.max_seq_len
129
+
130
+ self.pos_emb = HyenaPositionalEmbedding(config)
131
+
132
+ self.implicit_filter = nn.Sequential(
133
+ nn.Linear(self.emb_dim, config.filter_order),
134
+ act,
135
+ )
136
+ for i in range(config.num_inner_mlps):
137
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.filter_order))
138
+ self.implicit_filter.append(act)
139
+
140
+ self.implicit_filter.append(nn.Linear(config.filter_order, config.d_model, bias=False))
141
+
142
+ self.modulation = HyenaExponentialModulation(config.d_model)
143
+
144
+ self.normalized = False
145
+
146
+ def filter(self, L, *args, **kwargs):
147
+ z, t = self.pos_emb(L)
148
+ h = self.implicit_filter(z.to(dtype=self.implicit_filter[0].weight.dtype))
149
+ h = self.modulation(t, h)
150
+ return h
151
+
152
+ def forward(self, x, L, k=None, bias=None, *args, **kwargs):
153
+ if k is None: k = self.filter(L)
154
+
155
+ # Ensure compatibility with filters that return a tuple
156
+ k = k[0] if type(k) is tuple else k
157
+
158
+ y = fftconv(x, k, bias)
159
+ return y
160
+
161
+
162
+ class HyenaOperator(nn.Module):
163
+ def __init__(
164
+ self,
165
+ config,
166
+ **filter_args,
167
+ ):
168
+ r"""
169
+ Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf
170
+
171
+ Args:
172
+ d_model (int): Dimension of the input and output embeddings (width of the layer)
173
+ l_max: (int): Maximum input sequence length. Defaults to None
174
+ order: (int): Depth of the Hyena recurrence. Defaults to 2
175
+ dropout: (float): Dropout probability. Defaults to 0.0
176
+ filter_dropout: (float): Dropout probability for the filter. Defaults to 0.0
177
+ """
178
+ super().__init__()
179
+
180
+ self.d_model = config.d_model
181
+ self.l_max = config.max_seq_len
182
+ self.order = config.hyena_order
183
+ inner_width = config.d_model * (self.order + 1)
184
+ self.dropout = nn.Dropout(config.hyena_dropout)
185
+ self.in_proj = nn.Linear(self.d_model, inner_width)
186
+ self.out_proj = nn.Linear(self.d_model, self.d_model)
187
+
188
+ self.short_filter = nn.Conv1d(
189
+ inner_width,
190
+ inner_width,
191
+ config.short_filter_order,
192
+ padding=2,
193
+ groups=inner_width
194
+ )
195
+ self.filter_fn = HyenaFilter(config)
196
+
197
+ def forward(self, u):
198
+ l = u.size(-2)
199
+ l_filter = min(l, self.l_max)
200
+ u = self.in_proj(u).transpose(1, 2)
201
+
202
+ uc = self.short_filter(u)[...,:l_filter]
203
+ *x, v = uc.split(self.d_model, dim=1)
204
+
205
+ k = self.filter_fn.filter(l_filter)[0]
206
+ k = k.transpose(0, 1).reshape(self.order - 1, self.d_model, l_filter)
207
+ bias = self.filter_fn.bias.reshape(self.order - 1, self.d_model)
208
+
209
+ for o, x_i in enumerate(reversed(x[1:])):
210
+ v = self.dropout(v * x_i)
211
+ v = self.filter_fn(v, l_filter, k=k[o], bias=bias[o])
212
+
213
+ y = (v * x[0]).transpose(1, 2)
214
+
215
+ y = self.out_proj(y)
216
+ return y
217
+
218
+ class HyenaMlp(nn.Module):
219
+
220
+ def __init__(self, config):
221
+ """
222
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/mlp.py
223
+ """
224
+ super().__init__()
225
+ in_features = config.d_model
226
+ hidden_features = config.d_inner
227
+ self.fc1 = nn.Linear(in_features, hidden_features)
228
+ self.fc2 = nn.Linear(hidden_features, config.d_model)
229
+
230
+ def forward(self, x):
231
+ y = self.fc1(x)
232
+ y = F.gelu(y, approximate="tanh")
233
+ y = self.fc2(y)
234
+ return y
235
+
236
+ class HyenaBlock(nn.Module):
237
+
238
+ def __init__(self, config):
239
+ """
240
+ From https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/modules/block.py
241
+ For prenorm=True, this Block has a slightly different structure compared to a regular
242
+ prenorm Transformer block.
243
+ The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
244
+ [Ref: https://arxiv.org/abs/2002.04745]
245
+ Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both
246
+ the hidden_states (output of the MLP) and the residual.
247
+ This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
248
+ The residual needs to be provided (except for the very first block).
249
+ For prenorm=False, this Block has the same structure as a regular postnorm Transformer
250
+ block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN.
251
+ return_residual: whether each of the sub-layers (mixer and mlp) will return the residual.
252
+ This is for performance reasons: for the post-norm architecture, returning the input allows us
253
+ to fuse the backward of nn.Linear with the residual connection.
254
+ """
255
+ super().__init__()
256
+ self.mixer = HyenaOperator(config)
257
+ self.norm1 = nn.LayerNorm(config.d_model)
258
+ self.mlp = HyenaMlp(config)
259
+ self.norm2 = nn.LayerNorm(config.d_model)
260
+
261
+ def forward(self, hidden_states):
262
+ r"""Pass the input through the encoder layer.
263
+ Args:
264
+ hidden_states: the sequence to the encoder layer (required).
265
+ residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
266
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
267
+ before applying the query projection. Useful for e.g., ViT where we only care
268
+ about the CLS token in the last layer.
269
+ """
270
+ residual = hidden_states
271
+ residual = residual.to(torch.float32)
272
+ hyena_normed = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
273
+ hidden_states = self.mixer(hyena_normed)
274
+ # Tested above here and all is equivalent. That means the mixer is fine!!!
275
+ residual = hidden_states + residual
276
+ hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
277
+ residual = residual.to(torch.float32)
278
+
279
+ hidden_states = self.mlp(hidden_states)
280
+ return hidden_states + residual
281
+
282
+
283
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
284
+
285
+
286
+ class HyenaEmbeddings(nn.Module):
287
+
288
+ def __init__(self, config, padding_idx=None):
289
+ """
290
+ If max_position_embeddings <= 0, there are no position embeddings.
291
+ If word_embed_proj_dim is not None (e.g., OPT-350m), we embed to that dimension,
292
+ then project up to embed_dim.
293
+ """
294
+ super().__init__()
295
+ vocab_size = config.vocab_size
296
+ if vocab_size % config.pad_vocab_size_multiple != 0:
297
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
298
+ self.word_embeddings = nn.Embedding(vocab_size, config.d_model, padding_idx=padding_idx)
299
+
300
+ def forward(self, input_ids):
301
+ """
302
+ input_ids: (batch, seqlen)
303
+ """
304
+ embeddings = self.word_embeddings(input_ids)
305
+ return embeddings
306
+
307
+ class HyenaLMBackbone(nn.Module):
308
+
309
+ def __init__(self, config) -> None:
310
+ super().__init__()
311
+ # note max_position_embeddings is 0 for Hyena, and therefore isn't used
312
+ self.embeddings = HyenaEmbeddings(config)
313
+ self.dropout = nn.Dropout(config.embed_dropout)
314
+
315
+ self.layers = nn.ModuleList([HyenaBlock(config) for i in range(config.n_layer)])
316
+
317
+ self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
318
+ self.gradient_checkpointing = False
319
+
320
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
321
+ all_hidden_states = []
322
+ if inputs_embeds is not None:
323
+ hidden_states = inputs_embeds
324
+ else:
325
+ hidden_states = self.embeddings(input_ids)
326
+ if output_hidden_states:
327
+ all_hidden_states.append(hidden_states)
328
+
329
+ for layer in self.layers:
330
+ if self.gradient_checkpointing and self.training:
331
+ hidden_states = self._gradient_checkpointing_func(layer.__call__, hidden_states)
332
+ else:
333
+ hidden_states = layer(hidden_states)
334
+ if output_hidden_states:
335
+ all_hidden_states.append(hidden_states)
336
+
337
+ hidden_states = self.ln_f(hidden_states.to(dtype=self.ln_f.weight.dtype))
338
+ if output_hidden_states:
339
+ all_hidden_states.append(hidden_states)
340
+
341
+ return hidden_states, all_hidden_states
342
+
343
+
344
+ class HyenaDNAPreTrainedModel(PreTrainedModel):
345
+ config_class = HyenaConfig
346
+ base_model_prefix = "hyena"
347
+ supports_gradient_checkpointing = True
348
+ _no_split_modules = ["HyenaBlock"]
349
+ _skip_keys_device_placement = "past_key_values"
350
+ _keys_to_ignore_on_load_missing = [r"freq"] # Shared tensors that safetensors merges
351
+
352
+ def _init_weights(self, module, initializer_range=0.02):
353
+ if isinstance(module, nn.Linear):
354
+ nn.init.normal_(module.weight, std=initializer_range)
355
+ if module.bias is not None:
356
+ nn.init.zeros_(module.bias)
357
+ elif isinstance(module, nn.Embedding):
358
+ nn.init.normal_(module.weight, std=initializer_range)
359
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
360
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
361
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
362
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
363
+ #
364
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
365
+ for name, p in self.named_parameters():
366
+ if name in ["out_proj.weight", "fc2.weight"]:
367
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
368
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.n_layer))  # HyenaConfig defines n_layer, not num_layers
369
+ # If using GLU activation for now, we scale the std by 2
370
+ elif name in ["output_linear.0.weight"]:
371
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
372
+ nn.init.normal_(p, mean=0.0, std=initializer_range / math.sqrt(2 * self.config.n_layer))
373
+
374
+
375
+ class HyenaDNAModel(HyenaDNAPreTrainedModel):
376
+ def __init__(self, config, **kwargs) -> None:
377
+ super().__init__(config, **kwargs)
378
+
379
+ self.backbone = HyenaLMBackbone(config)
380
+ self.config = config
381
+
382
+ # Initialize weights and apply final processing
383
+ self.post_init()
384
+
385
+ def forward(self, input_ids, inputs_embeds=None, output_hidden_states=None, return_dict=None):
386
+ output_hidden_states = (
387
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
388
+ )
389
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
390
+
391
+ hidden_states, all_hidden_states = self.backbone(input_ids, inputs_embeds=inputs_embeds, output_hidden_states=output_hidden_states)
392
+ if return_dict:
393
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states,
394
+ hidden_states=all_hidden_states if output_hidden_states else None)
395
+ elif output_hidden_states:
396
+ return hidden_states, all_hidden_states
397
+ else:
398
+ return hidden_states
399
+
400
+
401
+ class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
402
+
403
+ def __init__(self, config, **kwargs):
404
+ super().__init__(config, **kwargs)
405
+ self.hyena = HyenaDNAModel(config)
406
+ vocab_size = config.vocab_size
407
+ if vocab_size % config.pad_vocab_size_multiple != 0:
408
+ vocab_size += config.pad_vocab_size_multiple - (vocab_size % config.pad_vocab_size_multiple)
409
+ self.vocab_size = vocab_size
410
+ self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)
411
+
412
+ # Initialize weights and apply final processing
413
+ self.post_init()
414
+
415
+ def get_input_embeddings(self):
416
+ return self.hyena.backbone.embeddings.word_embeddings
417
+
418
+ def set_input_embeddings(self, value):
419
+ self.hyena.backbone.embeddings.word_embeddings = value
420
+
421
+ def get_output_embeddings(self):
422
+ return self.lm_head
423
+
424
+ def set_output_embeddings(self, new_embeddings):
425
+ self.lm_head = new_embeddings
426
+
427
+ def set_decoder(self, decoder):
428
+ self.hyena = decoder
429
+
430
+ def get_decoder(self):
431
+ return self.hyena
432
+
433
+ def forward(
434
+ self,
435
+ input_ids: torch.LongTensor = None,
436
+ inputs_embeds: Optional[torch.FloatTensor] = None,
437
+ labels: Optional[torch.LongTensor] = None,
438
+ output_hidden_states: Optional[bool] = None,
439
+ return_dict: Optional[bool] = None,
440
+ ) -> Union[Tuple, CausalLMOutput]:
441
+
442
+ output_hidden_states = (
443
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
444
+ )
445
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
446
+
447
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
448
+ outputs = self.hyena(
449
+ input_ids=input_ids,
450
+ inputs_embeds=inputs_embeds,
451
+ output_hidden_states=output_hidden_states,
452
+ return_dict=return_dict,
453
+ )
454
+
455
+ hidden_states = outputs[0]
456
+ logits = self.lm_head(hidden_states)
457
+ logits = logits.float()
458
+
459
+ loss = None
460
+ if labels is not None:
461
+ # Shift so that tokens < n predict n
462
+ shift_logits = logits[..., :-1, :].contiguous()
463
+ shift_labels = labels[..., 1:].contiguous()
464
+ # Flatten the tokens
465
+ loss_fct = nn.CrossEntropyLoss()
466
+ shift_logits = shift_logits.view(-1, self.vocab_size)
467
+ shift_labels = shift_labels.view(-1)
468
+ # Enable model parallelism
469
+ shift_labels = shift_labels.to(shift_logits.device)
470
+ loss = loss_fct(shift_logits, shift_labels)
471
+
472
+ if not return_dict:
473
+ output = (logits,) + outputs[1:]
474
+ return (loss,) + output if loss is not None else output
475
+
476
+ return CausalLMOutput(
477
+ loss=loss,
478
+ logits=logits,
479
+ hidden_states=outputs.hidden_states,
480
+ )
481
+
482
+
483
+ class HyenaDNAForSequenceClassification(HyenaDNAPreTrainedModel):
484
+ def __init__(self, config, **kwargs):
485
+ super().__init__(config, **kwargs)
486
+ self.num_labels = kwargs.get("num_labels", config.num_labels)
487
+ self.hyena = HyenaDNAModel(config)
488
+ self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
489
+
490
+ # Initialize weights and apply final processing
491
+ self.post_init()
492
+
493
+ def get_input_embeddings(self):
494
+ return self.hyena.backbone.embeddings.word_embeddings
495
+
496
+ def set_input_embeddings(self, value):
497
+ self.hyena.backbone.embeddings.word_embeddings = value
498
+
499
+ def forward(
500
+ self,
501
+ input_ids: torch.LongTensor = None,
502
+ inputs_embeds: Optional[torch.FloatTensor] = None,
503
+ labels: Optional[torch.LongTensor] = None,
504
+ output_hidden_states: Optional[bool] = None,
505
+ return_dict: Optional[bool] = None,
506
+ ) -> Union[Tuple, SequenceClassifierOutput]:
507
+ r"""
508
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
509
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
510
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
511
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
512
+ """
513
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
514
+
515
+ transformer_outputs = self.hyena(
516
+ input_ids,
517
+ inputs_embeds=inputs_embeds,
518
+ output_hidden_states=output_hidden_states,
519
+ return_dict=return_dict,
520
+ )
521
+ hidden_states = transformer_outputs[0]
522
+ logits = self.score(hidden_states)
523
+
524
+ if input_ids is not None:
525
+ batch_size = input_ids.shape[0]
526
+ else:
527
+ batch_size = inputs_embeds.shape[0]
528
+
529
+ if self.config.pad_token_id is None and batch_size != 1:
530
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
531
+ if self.config.pad_token_id is None:
532
+ sequence_lengths = -1
533
+ else:
534
+ if input_ids is not None:
535
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
536
+ logits.device
537
+ )
538
+ else:
539
+ sequence_lengths = -1
540
+
541
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
542
+
543
+ loss = None
544
+ if labels is not None:
545
+ labels = labels.to(logits.device)
546
+ if self.config.problem_type is None:
547
+ if self.num_labels == 1:
548
+ self.config.problem_type = "regression"
549
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
550
+ self.config.problem_type = "single_label_classification"
551
+ else:
552
+ self.config.problem_type = "multi_label_classification"
553
+
554
+ if self.config.problem_type == "regression":
555
+ loss_fct = nn.MSELoss()
556
+ if self.num_labels == 1:
557
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
558
+ else:
559
+ loss = loss_fct(pooled_logits, labels)
560
+ elif self.config.problem_type == "single_label_classification":
561
+ loss_fct = nn.CrossEntropyLoss()
562
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
563
+ elif self.config.problem_type == "multi_label_classification":
564
+ loss_fct = nn.BCEWithLogitsLoss()
565
+ loss = loss_fct(pooled_logits, labels)
566
+ if not return_dict:
567
+ output = (pooled_logits,) + transformer_outputs[1:]
568
+ return ((loss,) + output) if loss is not None else output
569
+
570
+ return SequenceClassifierOutput(
571
+ loss=loss,
572
+ logits=pooled_logits,
573
+ hidden_states=transformer_outputs.hidden_states,
574
+ )
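The fftconv helper at the top of this file computes a causal convolution via the Convolution Theorem: both signals are zero-padded to 2 * seqlen, multiplied in the frequency domain, truncated back to seqlen, and a per-channel skip term u * D is added. The sketch below checks that numerically against an explicit direct convolution; fftconv is copied from the file above because modeling_hyena.py uses a relative import and is normally loaded through transformers' remote-code machinery rather than imported directly.

import torch

def fftconv(u, k, D):
    # Copied from modeling_hyena.py above: FFT-domain causal convolution plus skip term.
    seqlen = u.shape[-1]
    fft_size = 2 * seqlen
    k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
    u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
    if len(u.shape) > 3:
        k_f = k_f.unsqueeze(1)
    y = torch.fft.irfft(u_f * k_f, n=fft_size, norm="forward")[..., :seqlen]
    return (y + u * D.unsqueeze(-1)).to(dtype=u.dtype)

torch.manual_seed(0)
batch, channels, seqlen = 2, 3, 16
u = torch.randn(batch, channels, seqlen)
k = torch.randn(channels, seqlen)
D = torch.randn(channels)

# Reference: y[b, c, t] = sum_{s <= t} k[c, s] * u[b, c, t - s] + D[c] * u[b, c, t]
y_ref = torch.zeros_like(u)
for t in range(seqlen):
    for s in range(t + 1):
        y_ref[:, :, t] += k[:, s] * u[:, :, t - s]
y_ref += u * D[None, :, None]

print(torch.allclose(fftconv(u, k, D), y_ref, atol=1e-4))  # expected: True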
optimizer_state_dict.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbc2fac38ec5a3bae03043becad4825c1a4afe885a12bdd597f5d263a40d8b55
3
+ size 26307771
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dbaff97807d7961eb4208709c13b67c9b7fb92e5f7418a96202fef0ae7e5dd5
3
+ size 16300157
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenization_hyena.py ADDED
@@ -0,0 +1,117 @@
1
+ from transformers import PreTrainedTokenizer, AddedToken
2
+ from typing import List, Optional, Union, Dict, Sequence, Tuple
3
+ from pathlib import Path
4
+ import json
5
+ import os
6
+
7
+
8
+ class HyenaDNATokenizer(PreTrainedTokenizer):
9
+ model_input_names = ["input_ids"]
10
+
11
+ def __init__(self,
12
+ model_max_length: int,
13
+ bos_token="[BOS]",
14
+ eos_token="[SEP]",
15
+ sep_token="[SEP]",
16
+ cls_token="[CLS]",
17
+ pad_token="[PAD]",
18
+ mask_token="[MASK]",
19
+ unk_token="[UNK]",
20
+ **kwargs):
21
+ """Character tokenizer for Hugging Face transformers.
22
+ Args:
23
+ characters (Sequence[str]): List of desired characters. Any character which
24
+ is not included in this list will be replaced by a special token called
25
+ [UNK] with id=6. Following are list of all of the special tokens with
26
+ their corresponding ids:
27
+ "[CLS]": 0
28
+ "[SEP]": 1
29
+ "[BOS]": 2
30
+ "[MASK]": 3
31
+ "[PAD]": 4
32
+ "[RESERVED]": 5
33
+ "[UNK]": 6
34
+ an id (starting at 7) will be assigned to each character.
35
+ model_max_length (int): Model maximum sequence length.
36
+ """
37
+ self.characters = ('A', 'C', 'G', 'T', 'N')
38
+ self.model_max_length = model_max_length
39
+
40
+ self._vocab_str_to_int = {
41
+ "[CLS]": 0,
42
+ "[SEP]": 1,
43
+ "[BOS]": 2,
44
+ "[MASK]": 3,
45
+ "[PAD]": 4,
46
+ "[RESERVED]": 5,
47
+ "[UNK]": 6,
48
+ **{ch: i + 7 for i, ch in enumerate(self.characters)},
49
+ }
50
+ self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
51
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
52
+ padding_side = kwargs.pop("padding_side", "left")
53
+
54
+ super().__init__(
55
+ bos_token=bos_token,
56
+ eos_token=eos_token,
57
+ sep_token=sep_token,
58
+ cls_token=cls_token,
59
+ pad_token=pad_token,
60
+ mask_token=mask_token,
61
+ unk_token=unk_token,
62
+ add_prefix_space=add_prefix_space,
63
+ model_max_length=model_max_length,
64
+ padding_side=padding_side,
65
+ **kwargs,
66
+ )
67
+
68
+ @property
69
+ def vocab_size(self) -> int:
70
+ return len(self._vocab_str_to_int)
71
+
72
+ def _tokenize(self, text: str) -> List[str]:
73
+ return list(text)
74
+
75
+ def _convert_token_to_id(self, token: str) -> int:
76
+ return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
77
+
78
+ def _convert_id_to_token(self, index: int) -> str:
79
+ return self._vocab_int_to_str[index]
80
+
81
+ def convert_tokens_to_string(self, tokens):
82
+ return "".join(tokens)
83
+
84
+ def get_special_tokens_mask(
85
+ self,
86
+ token_ids_0: List[int],
87
+ token_ids_1: Optional[List[int]] = None,
88
+ already_has_special_tokens: bool = False,
89
+ ) -> List[int]:
90
+ if already_has_special_tokens:
91
+ return super().get_special_tokens_mask(
92
+ token_ids_0=token_ids_0,
93
+ token_ids_1=token_ids_1,
94
+ already_has_special_tokens=True,
95
+ )
96
+
97
+ result = ([0] * len(token_ids_0)) + [1]
98
+ if token_ids_1 is not None:
99
+ result += ([0] * len(token_ids_1)) + [1]
100
+ return result
101
+
102
+ def build_inputs_with_special_tokens(
103
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
104
+ ) -> List[int]:
105
+ sep = [self.sep_token_id]
106
+ # cls = [self.cls_token_id]
107
+ result = token_ids_0 + sep
108
+ if token_ids_1 is not None:
109
+ result += token_ids_1 + sep
110
+ return result
111
+
112
+ def get_vocab(self) -> Dict[str, int]:
113
+ return self._vocab_str_to_int
114
+
115
+ # HyenaDNA has a fixed vocabulary with no vocab file
116
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
117
+ return ()
tokenizer_config.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "[CLS]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "[SEP]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "[BOS]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "[MASK]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "6": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ }
52
+ },
53
+ "auto_map": {
54
+ "AutoTokenizer": [
55
+ "tokenization_hyena.HyenaDNATokenizer",
56
+ null
57
+ ]
58
+ },
59
+ "bos_token": "[BOS]",
60
+ "clean_up_tokenization_spaces": true,
61
+ "cls_token": "[CLS]",
62
+ "eos_token": "[SEP]",
63
+ "mask_token": "[MASK]",
64
+ "model_max_length": 256,
65
+ "name_or_path": "LongSafari/hyenadna-small-32k-seqlen-hf",
66
+ "pad_token": "[PAD]",
67
+ "padding_side": "right",
68
+ "sep_token": "[SEP]",
69
+ "special_tokens_map_file": "/home/hlv8980/.cache/huggingface/hub/models--LongSafari--hyenadna-small-32k-seqlen-hf/snapshots/8fe770c78eb13fe33bf81501612faeddf4d6f331/special_tokens_map.json",
70
+ "tokenizer_class": "HyenaDNATokenizer",
71
+ "unk_token": "[UNK]"
72
+ }
trainer_state.json ADDED
@@ -0,0 +1,156 @@
1
+ {
2
+ "best_metric": 0.39216598868370056,
3
+ "best_model_checkpoint": "/scratch/hlv8980/Attack_Benchmark/models/hyena/tf4/origin/checkpoint-600",
4
+ "epoch": 4.0,
5
+ "global_step": 1188,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.34,
12
+ "learning_rate": 2.8760984182776802e-05,
13
+ "loss": 0.5992,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.67,
18
+ "learning_rate": 2.615114235500879e-05,
19
+ "loss": 0.4813,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.67,
24
+ "eval_accuracy": 0.774,
25
+ "eval_f1": 0.7713328260834371,
26
+ "eval_loss": 0.48207539319992065,
27
+ "eval_matthews_correlation": 0.5579338694412199,
28
+ "eval_precision": 0.785067107786007,
29
+ "eval_recall": 0.772997299729973,
30
+ "eval_runtime": 0.1057,
31
+ "eval_samples_per_second": 9462.679,
32
+ "eval_steps_per_second": 151.403,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 1.01,
37
+ "learning_rate": 2.3514938488576452e-05,
38
+ "loss": 0.4431,
39
+ "step": 300
40
+ },
41
+ {
42
+ "epoch": 1.35,
43
+ "learning_rate": 2.087873462214411e-05,
44
+ "loss": 0.377,
45
+ "step": 400
46
+ },
47
+ {
48
+ "epoch": 1.35,
49
+ "eval_accuracy": 0.816,
50
+ "eval_f1": 0.8159933757615274,
51
+ "eval_loss": 0.427643358707428,
52
+ "eval_matthews_correlation": 0.6320128653971173,
53
+ "eval_precision": 0.815991263965056,
54
+ "eval_recall": 0.816021602160216,
55
+ "eval_runtime": 0.1039,
56
+ "eval_samples_per_second": 9625.637,
57
+ "eval_steps_per_second": 154.01,
58
+ "step": 400
59
+ },
60
+ {
61
+ "epoch": 1.68,
62
+ "learning_rate": 1.82688927943761e-05,
63
+ "loss": 0.3443,
64
+ "step": 500
65
+ },
66
+ {
67
+ "epoch": 2.02,
68
+ "learning_rate": 1.563268892794376e-05,
69
+ "loss": 0.33,
70
+ "step": 600
71
+ },
72
+ {
73
+ "epoch": 2.02,
74
+ "eval_accuracy": 0.824,
75
+ "eval_f1": 0.8239746523499383,
76
+ "eval_loss": 0.39216598868370056,
77
+ "eval_matthews_correlation": 0.6479558982194922,
78
+ "eval_precision": 0.8239935027265344,
79
+ "eval_recall": 0.8239623962396239,
80
+ "eval_runtime": 0.1031,
81
+ "eval_samples_per_second": 9696.512,
82
+ "eval_steps_per_second": 155.144,
83
+ "step": 600
84
+ },
85
+ {
86
+ "epoch": 2.36,
87
+ "learning_rate": 1.2996485061511423e-05,
88
+ "loss": 0.227,
89
+ "step": 700
90
+ },
91
+ {
92
+ "epoch": 2.69,
93
+ "learning_rate": 1.0360281195079087e-05,
94
+ "loss": 0.2219,
95
+ "step": 800
96
+ },
97
+ {
98
+ "epoch": 2.69,
99
+ "eval_accuracy": 0.838,
100
+ "eval_f1": 0.8379766686402841,
101
+ "eval_loss": 0.4026987850666046,
102
+ "eval_matthews_correlation": 0.6767651028795362,
103
+ "eval_precision": 0.8385613769517563,
104
+ "eval_recall": 0.8382038203820381,
105
+ "eval_runtime": 0.1026,
106
+ "eval_samples_per_second": 9746.534,
107
+ "eval_steps_per_second": 155.945,
108
+ "step": 800
109
+ },
110
+ {
111
+ "epoch": 3.03,
112
+ "learning_rate": 7.724077328646749e-06,
113
+ "loss": 0.2121,
114
+ "step": 900
115
+ },
116
+ {
117
+ "epoch": 3.37,
118
+ "learning_rate": 5.087873462214412e-06,
119
+ "loss": 0.1388,
120
+ "step": 1000
121
+ },
122
+ {
123
+ "epoch": 3.37,
124
+ "eval_accuracy": 0.857,
125
+ "eval_f1": 0.8566558306493891,
126
+ "eval_loss": 0.393052339553833,
127
+ "eval_matthews_correlation": 0.7159331394438886,
128
+ "eval_precision": 0.859342750257998,
129
+ "eval_recall": 0.8565956595659566,
130
+ "eval_runtime": 0.1036,
131
+ "eval_samples_per_second": 9656.574,
132
+ "eval_steps_per_second": 154.505,
133
+ "step": 1000
134
+ },
135
+ {
136
+ "epoch": 3.7,
137
+ "learning_rate": 2.4516695957820737e-06,
138
+ "loss": 0.1394,
139
+ "step": 1100
140
+ },
141
+ {
142
+ "epoch": 4.0,
143
+ "step": 1188,
144
+ "total_flos": 152279543808000.0,
145
+ "train_loss": 0.30630172623528373,
146
+ "train_runtime": 38.7189,
147
+ "train_samples_per_second": 1962.866,
148
+ "train_steps_per_second": 30.683
149
+ }
150
+ ],
151
+ "max_steps": 1188,
152
+ "num_train_epochs": 4,
153
+ "total_flos": 152279543808000.0,
154
+ "trial_name": null,
155
+ "trial_params": null
156
+ }
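The eval_* keys logged above (accuracy, f1, matthews_correlation, precision, recall) are what a standard scikit-learn based compute_metrics hook would report. The exact function used for this run is not part of the upload, so the following is only a plausible reconstruction, with macro averaging assumed for f1, precision and recall:

import numpy as np
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score)

def compute_metrics(eval_pred):
    # eval_pred unpacks to (logits, labels) as passed by the transformers Trainer.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),           # averaging mode is an assumption
        "matthews_correlation": matthews_corrcoef(labels, preds),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro"),
    }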
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4e3a85efd6ca1a2228fc0bf6f5ca43150a2981352327acbf61aa5be7e43d49
3
+ size 3707