picocreator committed
Commit 33b8599 · verified · 1 Parent(s): fb27943

Upload 13 files

__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "RWKV7ForCausalLM",
+     "RWKV7Model",
+     "RWKV7PreTrainedModel"
+   ],
+   "bos_token_id": 0,
+   "device": null,
+   "dropout_rate": 0.0,
+   "dtype": null,
+   "eos_token_id": 0,
+   "head_size": 64,
+   "hidden_size": 2048,
+   "hidden_size_att": 2048,
+   "hidden_size_ffn": 8192,
+   "init_state_wkv": false,
+   "layer_id": null,
+   "model_type": "rwkv7",
+   "num_hidden_layers": 24,
+   "tie_word_embeddings": false,
+   "tmix_backend": "auto",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.48.0",
+   "vocab_size": 50304,
+   "auto_map": {
+     "AutoConfig": "configuration_rwkv7.RWKV7Config",
+     "AutoModel": "modeling_rwkv7.RWKV7Model",
+     "AutoModelForCausalLM": "modeling_rwkv7.RWKV7ForCausalLM"
+   }
+ }
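The `auto_map` block above is what lets the stock `Auto*` loaders resolve these custom classes, so the checkpoint is meant to be loaded with `trust_remote_code=True`. A minimal loading sketch; the repo id is a placeholder, and the dtype simply mirrors the `torch_dtype` entry above:

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "picocreator/rwkv7-goose-upload"  # placeholder repo id, not the real one

# trust_remote_code is required: the config/model classes live inside this repo
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype="bfloat16",  # matches "torch_dtype" in config.json
)
print(config.num_hidden_layers, config.hidden_size)  # 24, 2048
```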
configuration_rwkv7.py ADDED
@@ -0,0 +1,129 @@
+ """ RWKV configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ # from transformers.utils import logging
+ # logger = logging.get_logger(__name__)
+
+ # Import the dependencies
+ from .modeling_blocks_rwkv7 import RWKV7GooseConfigMap
+
+ class RWKV7Config(PretrainedConfig):
+     """
+     This is the configuration class to store the configuration of a [`Rwkv7Model`]. It is used to instantiate a RWKV7
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a configuration similar to that of the RWKV-7
+     [RWKV/v7-Goose-1.6B-Pile-HF](https://huggingface.co/RWKV/v7-Goose-1.6B-Pile-HF) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 65536):
+             Vocabulary size of the RWKV7 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`Rwkv7Model`].
+         num_hidden_layers (`int`, *optional*, defaults to 24):
+             Number of hidden layers in the model.
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the embeddings and hidden states.
+         hidden_size_att (`int`, *optional*):
+             Dimensionality of the attention hidden states. Will be computed from `hidden_size` if unset.
+         hidden_size_ffn (`int`, *optional*):
+             Dimensionality of the FFN hidden states. Will be computed from `hidden_size` if unset.
+         head_size (`int`, *optional*, defaults to 64):
+             Head size of the RWKV7 self-attention (time mix) module.
+         tmix_backend (`str`, *optional*, defaults to `"auto"`):
+             Backend to use for the time mix module. "auto" defaults to "pytorch" if the device is "cpu" and "cuda" otherwise.
+             (Valid values: "auto", "pytorch", "cuda", "triton", "triton_bighead", "fla", "fla_fused", "pytorch_ref", "pytorch_ref_fp32")
+         init_state_wkv (`bool`, *optional*, defaults to `False`):
+             Whether to initialize the wkv state in the model. Used for WKV state tuning.
+         device (`str`, *optional*):
+             Device to use for the model. Use the respective torch.device types.
+         dtype (`str`, *optional*):
+             Model weights data type. Use the respective torch.dtype types.
+         bos_token_id (`int`, *optional*, defaults to 0):
+             The id of the beginning of sentence token in the vocabulary.
+         eos_token_id (`int`, *optional*, defaults to 0):
+             The id of the end of sentence token in the vocabulary.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether or not to tie the word embeddings with the input token embeddings.
+             (This value is currently ignored in our implementation.)
+
+     Example:
+
+     ```python
+     >>> from transformers import Rwkv7Config, Rwkv7Model
+
+     >>> # Initializing a Rwkv7 configuration
+     >>> configuration = Rwkv7Config()
+
+     >>> # Initializing a model (with random weights) from the configuration
+     >>> model = Rwkv7Model(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "rwkv7"
+
+     def __init__(
+         self,
+         ########################################
+         # Vocab, layer count, and hidden size
+         vocab_size=65536,
+         num_hidden_layers=24,
+         hidden_size=768,
+         # Optional hidden sizes
+         hidden_size_att=None,
+         hidden_size_ffn=None,
+         # Head size, timemix backend
+         head_size=64,
+         tmix_backend="auto",
+         init_state_wkv=False,
+         # Trainer model configs
+         dropout_rate=0.0,
+         # Torch device and dtype
+         device=None,
+         dtype=None,
+         # Tokenizer related settings in HF configuration
+         bos_token_id=0,
+         eos_token_id=0,
+         tie_word_embeddings=False,
+         ########################################
+         **kwargs,
+     ):
+         # Normalize dtype if torch_dtype is set within kwargs
+         if dtype is None and "torch_dtype" in kwargs:
+             dtype = kwargs["torch_dtype"]
+
+         self.vocab_size = vocab_size
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.hidden_size_att = hidden_size_att
+         self.hidden_size_ffn = hidden_size_ffn
+
+         self.head_size = head_size
+         self.tmix_backend = tmix_backend
+         self.init_state_wkv = init_state_wkv
+
+         self.device = device
+         self.dtype = dtype
+
+         self.dropout_rate = dropout_rate
+
+         # Forward to the HF PretrainedConfig
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs
+         )
+
+     @staticmethod
+     def from_model_state_dict(state_dict: dict, **kwargs):
+         goose_config = RWKV7GooseConfigMap.from_model_state_dict(state_dict)
+         # Join dictionary with **goose_config.__dict__ and **kwargs
+         return RWKV7Config(**{**goose_config.__dict__, **kwargs})
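The `from_model_state_dict` helper above derives a config directly from a raw RWKV7 checkpoint's weights via `RWKV7GooseConfigMap`. A hedged usage sketch; the checkpoint filename and the override kwarg are illustrative:

```python
import torch
from configuration_rwkv7 import RWKV7Config  # module name as uploaded in this commit

# Hypothetical path to a raw RWKV7 ".pth" state dict
state_dict = torch.load("rwkv7-goose.pth", map_location="cpu", weights_only=True)

# Layer count and hidden sizes are inferred from the weights; extra kwargs override them
config = RWKV7Config.from_model_state_dict(state_dict, tmix_backend="pytorch")
print(config.num_hidden_layers, config.hidden_size)
```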
cuda/state_wkv7_cuda.cu ADDED
@@ -0,0 +1,152 @@
+ #include <cuda_bf16.h>
+ #include <assert.h>
+
+ using bf = __nv_bfloat16;
+ __device__ inline float to_float(const bf & u) { return __bfloat162float(u); }
+ __device__ inline bf to_bf(const float & u) { return __float2bfloat16_rn(u); }
+
+ typedef bf * __restrict__ F_;
+
+ __global__ void forward_kernel(int T, int H, float*_state, F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, bf* y_, float* s_, float* sa_) {
+     constexpr int C = _C_;
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+
+     float state[C] = {0};
+     int s_idx = bb*H*C*C + hh*C*C + i*C;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         state[j] = _state[s_idx+j];
+     }
+
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     for (int t = 0; t < T; t++) {
+         int ind = bb*T*H*C + t*H*C + hh * C + i;
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             sa += a[j] * state[j];
+         }
+         sa_[ind] = sa;
+
+         float v = to_float(v_[ind]);
+         float y = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             float& s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int base = (bb*H+hh)*(T/_CHUNK_LEN_)*C*C + (t/_CHUNK_LEN_)*C*C + i;
+             #pragma unroll
+             for (int j = 0; j < C; j++) {
+                 s_[base + j*C] = state[j];
+             }
+         }
+     }
+
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         _state[s_idx+j] = state[j];
+     }
+     __syncthreads();
+ }
+
+ __global__ void backward_kernel(int T, int H, float*_state, F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, F_ dy_, float * __restrict__ s_, float * __restrict__ sa_, bf* dw_, bf* dq_, bf* dk_, bf* dv_, bf* da_, bf* db_) {
+     constexpr int C = _C_;
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+
+     float stateT[C] = {0}, dstate[C] = {0}, dstateT[C] = {0};
+     __shared__ float w[C], q[C], k[C], v[C], a[C], b[C], dy[C], sa[C], dSb_shared[C];
+     float qi, wi, ki, ai, bi, dyi;
+
+     for (int t = T-1; t >= 0; t--) {
+         int ind = bb*T*H*C + t*H*C + hh * C + i;
+         __syncthreads();
+         q[i] = qi = to_float(q_[ind]);
+         float wi_fac = -__expf(to_float(w_[ind]));
+         w[i] = wi = __expf(wi_fac);
+         k[i] = ki = to_float(k_[ind]);
+         a[i] = ai = to_float(a_[ind]);
+         b[i] = bi = to_float(b_[ind]);
+         v[i] = to_float(v_[ind]);
+         dy[i] = dyi = to_float(dy_[ind]);
+         sa[i] = sa_[ind];
+         __syncthreads();
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int base = (bb*H+hh)*(T/_CHUNK_LEN_)*C*C + (t/_CHUNK_LEN_)*C*C + i*C;
+             #pragma unroll
+             for (int j = 0; j < C; j++) {
+                 stateT[j] = s_[base + j];
+             }
+         }
+
+         float dq = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dq += stateT[j]*dy[j];
+         }
+         dq_[ind] = to_bf(dq);
+
+         float iwi = 1.0f/wi;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             stateT[j] = (stateT[j] - ki*v[j] - bi*sa[j]) * iwi;
+             dstate[j] += dyi * q[j];
+             dstateT[j] += qi * dy[j];
+         }
+
+         float dw = 0, dk = 0, dv = 0, db = 0, dSb = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dw += dstateT[j]*stateT[j];
+             dk += dstateT[j]*v[j];
+             dv += dstate[j]*k[j];
+             dSb += dstate[j]*b[j];
+             db += dstateT[j]*sa[j];
+         }
+         dw_[ind] = to_bf(dw * wi * wi_fac);
+         dk_[ind] = to_bf(dk);
+         dv_[ind] = to_bf(dv);
+         db_[ind] = to_bf(db);
+
+         __syncthreads();
+         dSb_shared[i] = dSb;
+         __syncthreads();
+
+         float da = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             da += stateT[j]*dSb_shared[j];
+         }
+         da_[ind] = to_bf(da);
+
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dstate[j] = dstate[j]*w[j] + dSb * a[j];
+             dstateT[j] = dstateT[j]*wi + ai * dSb_shared[j];
+         }
+     }
+ }
+
+ void cuda_forward(int B, int T, int H, float*_state, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa) {
+     forward_kernel<<<dim3(H,B), dim3(_C_)>>>(T,H,_state,w,q,k,v,z,a,y,s,sa);
+ }
+ void cuda_backward(int B, int T, int H, float*_state, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy, float*s, float*sa, bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da) {
+     assert(T%_CHUNK_LEN_ == 0);
+     backward_kernel<<<dim3(H,B), dim3(_C_)>>>(T,H,_state,w,q,k,v,z,a,dy,s,sa,dw,dq,dk,dv,dz,da);
+ }
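The kernel depends on two compile-time macros, `_C_` (the head size) and `_CHUNK_LEN_` (how often the state is checkpointed for the backward pass), so it must be JIT-built with matching `-D` flags. A build sketch using `torch.utils.cpp_extension.load`; the concrete values 64 and 16 are assumptions for illustration, not read from this commit:

```python
from torch.utils.cpp_extension import load

HEAD_SIZE = 64   # must match config.head_size (_C_)
CHUNK_LEN = 16   # assumed state-checkpoint interval (_CHUNK_LEN_)

# Builds cuda/state_wkv7_op.cpp + cuda/state_wkv7_cuda.cu and registers the
# torch.ops.state_wind_backstepping.{forward,backward} operators.
load(
    name="state_wind_backstepping",
    sources=["cuda/state_wkv7_op.cpp", "cuda/state_wkv7_cuda.cu"],
    is_python_module=False,  # ops are exposed through torch.ops, not as a Python module
    verbose=True,
    extra_cuda_cflags=["-O3", f"-D_C_={HEAD_SIZE}", f"-D_CHUNK_LEN_={CHUNK_LEN}"],
)
```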
cuda/state_wkv7_op.cpp ADDED
@@ -0,0 +1,34 @@
+ #include <torch/extension.h>
+ #include <cuda_bf16.h>
+ using bf = __nv_bfloat16;
+
+ void cuda_forward(int B, int T, int H, float*_state, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa);
+
+ void forward(torch::Tensor &_state, torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v, torch::Tensor &z, torch::Tensor &a, torch::Tensor &y, torch::Tensor &s, torch::Tensor &sa) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_forward(B, T, H, (float*)_state.data_ptr(), (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(), (bf*)y.data_ptr(), (float*)s.data_ptr(), (float*)sa.data_ptr());
+ }
+
+ void cuda_backward(int B, int T, int H, float*_state, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy, float*s, float*sa, bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da);
+
+ void backward(torch::Tensor &_state, torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v, torch::Tensor &z, torch::Tensor &a, torch::Tensor &dy,
+               torch::Tensor &s, torch::Tensor &sa, torch::Tensor &dw, torch::Tensor &dq, torch::Tensor &dk, torch::Tensor &dv, torch::Tensor &dz, torch::Tensor &da) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_backward(B, T, H, (float*)_state.data_ptr(), (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(), (bf*)dy.data_ptr(),
+                   (float*)s.data_ptr(), (float*)sa.data_ptr(), (bf*)dw.data_ptr(), (bf*)dq.data_ptr(), (bf*)dk.data_ptr(), (bf*)dv.data_ptr(), (bf*)dz.data_ptr(), (bf*)da.data_ptr());
+ }
+
+ TORCH_LIBRARY(state_wind_backstepping, m) {
+     m.def("forward(Tensor _state, Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor(a!) y, Tensor(b!) s, Tensor(c!) sa) -> ()");
+     m.def("backward(Tensor _state, Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor dy, Tensor s, Tensor sa, Tensor(a!) dw, Tensor(b!) dq, Tensor(c!) dk, Tensor(d!) dv, Tensor(e!) dz, Tensor(f!) da) -> ()");
+ }
+
+ TORCH_LIBRARY_IMPL(state_wind_backstepping, CUDA, m) {
+     m.impl("forward", &forward);
+     m.impl("backward", &backward);
+ }
+
+ // TORCH_LIBRARY(state_wind_backstepping, m) {
+ //     m.def("forward", forward);
+ //     m.def("backward", backward);
+ // }
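Once compiled, the schema above is callable as `torch.ops.state_wind_backstepping.forward`, with the outputs passed in as pre-allocated mutable tensors. The shapes below are inferred from the kernel's indexing (`w, q, k, v, z, a` and `y` are bfloat16 `[B, T, H, C]`, the carried state is float32 `[B, H, C, C]`, and `s`/`sa` are float32 scratch buffers for the backward pass); the sizes themselves are only an example:

```python
import torch

B, T, H, C, CHUNK_LEN = 1, 32, 32, 64, 16  # example sizes; T must be a multiple of CHUNK_LEN
bf16 = dict(dtype=torch.bfloat16, device="cuda")

w, q, k, v, z, a = (torch.randn(B, T, H, C, **bf16) for _ in range(6))
state = torch.zeros(B, H, C, C, dtype=torch.float32, device="cuda")              # carried wkv state
y = torch.empty(B, T, H, C, **bf16)                                              # per-step output
s = torch.empty(B, H, T // CHUNK_LEN, C, C, dtype=torch.float32, device="cuda")  # chunked state checkpoints
sa = torch.empty(B, T, H, C, dtype=torch.float32, device="cuda")                 # per-step a·state read-outs

# The op only has a CUDA implementation, so every tensor must live on the GPU.
torch.ops.state_wind_backstepping.forward(state, w, q, k, v, z, a, y, s, sa)
print(y.shape, state.abs().max())
```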
cuda/wkv7_cuda.cu ADDED
@@ -0,0 +1,138 @@
+ #include <cuda_bf16.h>
+ #include <assert.h>
+
+ using bf = __nv_bfloat16;
+ __device__ inline float to_float(const bf & u) { return __bfloat162float(u); }
+ __device__ inline bf to_bf(const float & u) { return __float2bfloat16_rn(u); }
+
+ typedef bf * __restrict__ F_;
+
+ __global__ void forward_kernel(int T, int H, F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, bf* y_, float* s_, float* sa_) {
+     constexpr int C = _C_;
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+
+     float state[C] = {0};
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     for (int t = 0; t < T; t++) {
+         int ind = bb*T*H*C + t*H*C + hh * C + i;
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             sa += a[j] * state[j];
+         }
+         sa_[ind] = sa;
+
+         float v = to_float(v_[ind]);
+         float y = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             float& s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int base = (bb*H+hh)*(T/_CHUNK_LEN_)*C*C + (t/_CHUNK_LEN_)*C*C + i;
+             #pragma unroll
+             for (int j = 0; j < C; j++) {
+                 s_[base + j*C] = state[j];
+             }
+         }
+     }
+ }
+
+ __global__ void backward_kernel(int T, int H, F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, F_ dy_, float * __restrict__ s_, float * __restrict__ sa_, bf* dw_, bf* dq_, bf* dk_, bf* dv_, bf* da_, bf* db_) {
+     constexpr int C = _C_;
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+
+     float stateT[C] = {0}, dstate[C] = {0}, dstateT[C] = {0};
+     __shared__ float w[C], q[C], k[C], v[C], a[C], b[C], dy[C], sa[C], dSb_shared[C];
+     float qi, wi, ki, ai, bi, dyi;
+
+     for (int t = T-1; t >= 0; t--) {
+         int ind = bb*T*H*C + t*H*C + hh * C + i;
+         __syncthreads();
+         q[i] = qi = to_float(q_[ind]);
+         float wi_fac = -__expf(to_float(w_[ind]));
+         w[i] = wi = __expf(wi_fac);
+         k[i] = ki = to_float(k_[ind]);
+         a[i] = ai = to_float(a_[ind]);
+         b[i] = bi = to_float(b_[ind]);
+         v[i] = to_float(v_[ind]);
+         dy[i] = dyi = to_float(dy_[ind]);
+         sa[i] = sa_[ind];
+         __syncthreads();
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int base = (bb*H+hh)*(T/_CHUNK_LEN_)*C*C + (t/_CHUNK_LEN_)*C*C + i*C;
+             #pragma unroll
+             for (int j = 0; j < C; j++) {
+                 stateT[j] = s_[base + j];
+             }
+         }
+
+         float dq = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dq += stateT[j]*dy[j];
+         }
+         dq_[ind] = to_bf(dq);
+
+         float iwi = 1.0f/wi;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             stateT[j] = (stateT[j] - ki*v[j] - bi*sa[j]) * iwi;
+             dstate[j] += dyi * q[j];
+             dstateT[j] += qi * dy[j];
+         }
+
+         float dw = 0, dk = 0, dv = 0, db = 0, dSb = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dw += dstateT[j]*stateT[j];
+             dk += dstateT[j]*v[j];
+             dv += dstate[j]*k[j];
+             dSb += dstate[j]*b[j];
+             db += dstateT[j]*sa[j];
+         }
+         dw_[ind] = to_bf(dw * wi * wi_fac);
+         dk_[ind] = to_bf(dk);
+         dv_[ind] = to_bf(dv);
+         db_[ind] = to_bf(db);
+
+         __syncthreads();
+         dSb_shared[i] = dSb;
+         __syncthreads();
+
+         float da = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             da += stateT[j]*dSb_shared[j];
+         }
+         da_[ind] = to_bf(da);
+
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dstate[j] = dstate[j]*w[j] + dSb * a[j];
+             dstateT[j] = dstateT[j]*wi + ai * dSb_shared[j];
+         }
+     }
+ }
+
+ void cuda_forward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa) {
+     forward_kernel<<<dim3(H,B), dim3(_C_)>>>(T,H,w,q,k,v,z,a,y,s,sa);
+ }
+ void cuda_backward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy, float*s, float*sa, bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da) {
+     assert(T%_CHUNK_LEN_ == 0);
+     backward_kernel<<<dim3(H,B), dim3(_C_)>>>(T,H,w,q,k,v,z,a,dy,s,sa,dw,dq,dk,dv,dz,da);
+ }
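For readers who prefer PyTorch to CUDA, the recurrence that `forward_kernel` implements can be written out step by step. This is a slow reference sketch derived from the kernel above (not a file in this upload): the decay is `exp(-exp(w))`, `sa = S·a` is the in-context read-out, and each step updates `S ← S·diag(decay) + sa·bᵀ + v·kᵀ` before producing `y = S·q`.

```python
import torch

def wkv7_reference(w, q, k, v, a, b, state=None):
    """Naive per-timestep reference of forward_kernel; inputs are [B, T, H, C]."""
    B, T, H, C = w.shape
    decay = torch.exp(-torch.exp(w.float()))                   # same transform as __expf(-__expf(w))
    q, k, v, a, b = (x.float() for x in (q, k, v, a, b))
    S = torch.zeros(B, H, C, C, device=w.device) if state is None else state.clone()
    y = torch.empty(B, T, H, C, device=w.device)
    for t in range(T):
        sa = torch.einsum("bhij,bhj->bhi", S, a[:, t])         # sa_i = sum_j a_j * S_ij
        S = (S * decay[:, t].unsqueeze(-2)                     # per-key-channel decay
             + sa.unsqueeze(-1) * b[:, t].unsqueeze(-2)        # state feedback term
             + v[:, t].unsqueeze(-1) * k[:, t].unsqueeze(-2))  # new key/value outer product
        y[:, t] = torch.einsum("bhij,bhj->bhi", S, q[:, t])    # y_i = sum_j S_ij * q_j
    return y, S
```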
cuda/wkv7_op.cpp ADDED
@@ -0,0 +1,34 @@
+ #include <torch/extension.h>
+ #include <cuda_bf16.h>
+ using bf = __nv_bfloat16;
+
+ void cuda_forward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa);
+
+ void forward(torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v, torch::Tensor &z, torch::Tensor &a, torch::Tensor &y, torch::Tensor &s, torch::Tensor &sa) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_forward(B, T, H, (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(), (bf*)y.data_ptr(), (float*)s.data_ptr(), (float*)sa.data_ptr());
+ }
+
+ void cuda_backward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy, float*s, float*sa, bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da);
+
+ void backward(torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v, torch::Tensor &z, torch::Tensor &a, torch::Tensor &dy,
+               torch::Tensor &s, torch::Tensor &sa, torch::Tensor &dw, torch::Tensor &dq, torch::Tensor &dk, torch::Tensor &dv, torch::Tensor &dz, torch::Tensor &da) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_backward(B, T, H, (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(), (bf*)dy.data_ptr(),
+                   (float*)s.data_ptr(), (float*)sa.data_ptr(), (bf*)dw.data_ptr(), (bf*)dq.data_ptr(), (bf*)dk.data_ptr(), (bf*)dv.data_ptr(), (bf*)dz.data_ptr(), (bf*)da.data_ptr());
+ }
+
+ TORCH_LIBRARY(wind_backstepping, m) {
+     m.def("forward(Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor(a!) y, Tensor(b!) s, Tensor(c!) sa) -> ()");
+     m.def("backward(Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor dy, Tensor s, Tensor sa, Tensor(a!) dw, Tensor(b!) dq, Tensor(c!) dk, Tensor(d!) dv, Tensor(e!) dz, Tensor(f!) da) -> ()");
+ }
+
+ TORCH_LIBRARY_IMPL(wind_backstepping, CUDA, m) {
+     m.impl("forward", &forward);
+     m.impl("backward", &backward);
+ }
+
+ // TORCH_LIBRARY(wind_backstepping, m) {
+ //     m.def("forward", forward);
+ //     m.def("backward", backward);
+ // }
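On the Python side these mutable-output ops are normally wrapped in a `torch.autograd.Function`; the actual wrapper presumably lives in `modeling_blocks_rwkv7.py`, whose diff is not rendered here, so the following is only a plausible sketch. `CHUNK_LEN` and the buffer shapes are assumptions carried over from the kernel above:

```python
import torch

CHUNK_LEN = 16  # assumption; must match the -D_CHUNK_LEN_ build flag

class WindBackstepping(torch.autograd.Function):
    """Sketch of an autograd wrapper around torch.ops.wind_backstepping."""

    @staticmethod
    def forward(ctx, w, q, k, v, z, a):
        B, T, H, C = w.shape
        y = torch.empty_like(v)
        s = torch.empty(B, H, T // CHUNK_LEN, C, C, dtype=torch.float32, device=w.device)
        sa = torch.empty(B, T, H, C, dtype=torch.float32, device=w.device)
        torch.ops.wind_backstepping.forward(w, q, k, v, z, a, y, s, sa)
        ctx.save_for_backward(w, q, k, v, z, a, s, sa)
        return y

    @staticmethod
    def backward(ctx, dy):
        w, q, k, v, z, a, s, sa = ctx.saved_tensors
        dw, dq, dk, dv, dz, da = (torch.empty_like(x) for x in (w, q, k, v, z, a))
        torch.ops.wind_backstepping.backward(w, q, k, v, z, a, dy.contiguous(),
                                             s, sa, dw, dq, dk, dv, dz, da)
        return dw, dq, dk, dv, dz, da
```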
modeling_blocks_rwkv7.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_rwkv7.py ADDED
@@ -0,0 +1,460 @@
+ """ RWKV Modeling"""
+
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import (
+     ModelOutput,
+     add_code_sample_docstrings,
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     is_ninja_available,
+     is_torch_cuda_available,
+     logging,
+ )
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_outputs import ModelOutput
+
+ import torch
+ from torch import nn
+ from torch.nn import CrossEntropyLoss
+ import torch.nn.functional as F
+
+ import warnings
+ from dataclasses import dataclass
+ from typing import List, Dict, Optional, Tuple, Union, Any
+
+ # Load the RWKV7Config and RWKV7GooseModel
+ from .configuration_rwkv7 import RWKV7Config
+ from .modeling_blocks_rwkv7 import RWKV7GooseModel
+
+ logger = logging.get_logger(__name__)
+
+ class RWKV7PreTrainedModel(PreTrainedModel, RWKV7GooseModel):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
+     """
+     config_class = RWKV7Config
+     base_model_prefix = "rwkv7"
+     is_parallelizable = True
+     _no_split_modules = ["RWKV7LayerBlock"]
+     _keep_in_fp32_modules = []
+     supports_gradient_checkpointing = True
+
+     def __init__(self, config: RWKV7Config):
+         RWKV7GooseModel.__init__(self, config.__dict__)
+         self.config = config
+
+     def _init_weights(self, module):
+         # Prefer the module's own initialization if it provides one
+         if hasattr(module, 'reset_parameters'):
+             module.reset_parameters()
+             return
+         elif hasattr(module, 'init_parameters'):
+             module.init_parameters()
+             return
+
+         # Default initializer_range for Linear / LN layers
+         initializer_range = 0.02
+
+         if isinstance(module, (nn.ParameterList, nn.ModuleList)):
+             # Iterate and initialize each parameter
+             for param in module:
+                 self._init_weights(param)
+         elif isinstance(module, nn.ParameterDict):
+             # Iterate and initialize each parameter
+             for key, param in module.items():
+                 self._init_weights(param)
+
+         elif isinstance(module, (nn.Linear, nn.Conv1d)):
+             # Slightly different from the TF version which uses truncated_normal for initialization
+             # cf https://github.com/pytorch/pytorch/pull/5617
+             nn.init.normal_(module.weight, mean=0.0, std=initializer_range)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.LayerNorm):
+             nn.init.normal_(module.weight, mean=0.0, std=initializer_range)
+         elif isinstance(module, nn.Parameter):
+             nn.init.normal_(module, mean=0.0, std=initializer_range)
+         elif isinstance(module, nn.Embedding):
+             nn.init.normal_(module.weight, mean=0.0, std=initializer_range)
+
+             # # RWKV does not use a blank pad idx. The pad_idx is a training token
+             # if module.padding_idx is not None:
+             #     module.weight.data[module.padding_idx].zero_()
+
+ @dataclass
+ class RWKV7Output(ModelOutput):
+     """
+     Class for the RWKV model outputs.
+     Args:
+         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+             Sequence of hidden-states at the output of the last layer of the model.
+         rwkv_state (list of per-layer RWKV state tuples of `torch.Tensor`):
+             The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+             avoid providing the old `input_ids`.
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+             the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+             the self-attention heads.
+     """
+     last_hidden_state: torch.FloatTensor = None
+     rwkv_state: Optional[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+ @dataclass
+ class RWKV7CausalLMOutput(ModelOutput):
+     """
+     Base class for causal language model (or autoregressive) outputs.
+     Args:
+         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+             Language modeling loss (for next-token prediction).
+         logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+         rwkv_state (list of per-layer RWKV state tuples of `torch.Tensor`):
+             The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+             avoid providing the old `input_ids`.
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+             the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+             the self-attention heads.
+     """
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     rwkv_state: Optional[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+ RWKV7_START_DOCSTRING = r"""
+     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+     etc.) This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+     subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+     general usage and behavior.
+     Parameters:
+         config ([`Rwkv7Config`]): Model configuration class with all the parameters of the model.
+             Initializing with a config file does not load the weights associated with the model, only the
+             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+ RWKV7_INPUTS_DOCSTRING = r"""
+     Args:
+         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+             `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+             `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
+             sequence tokens in the vocabulary. If `past_key_values` is used, only `input_ids` that do not have their
+             past calculated should be passed as `input_ids`. Indices can be obtained using [`AutoTokenizer`]. See
+             [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+             IDs?](../glossary#input-ids)
+         attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+             [What are attention masks?](../glossary#attention-mask)
+         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+             model's internal embedding lookup matrix.
+
+         rwkv_state (list of per-layer RWKV block states `(batch_size, hidden_state)`, *optional*):
+             If passed along, the model uses the previous state in all the blocks (which will give the output for the
+             `input_ids` provided as if the model received `state_input_ids + input_ids` as context).
+
+         use_cache (`bool`, *optional*):
+             If set to `True`, the last state is returned and can be used to quickly generate the next logits.
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+
+ @add_start_docstrings(
+     "The bare RWKV7 Model transformer outputting raw hidden-states without the language modeling head on top (the head variable is still declared).",
+     RWKV7_START_DOCSTRING,
+ )
+ class RWKV7Model(RWKV7PreTrainedModel):
+     def __init__(self, config: RWKV7Config):
+         super().__init__(config)
+
+     def get_input_embeddings(self):
+         return self.emb
+
+     def set_input_embeddings(self, value):
+         self.emb = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     @add_start_docstrings_to_model_forward(RWKV7_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         output_type=RWKV7Output,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.LongTensor] = None,  # not in use
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         rwkv_state: Optional[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs
+     ) -> Union[Tuple, RWKV7Output]:
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if output_attentions:
+             logger.warning_once("`RWKV7Model` does not support `output_attentions` for now, setting it to `False`.")
+             output_attentions = False
+
+         if self.gradient_checkpointing and self.training and use_cache:
+             logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
+             use_cache = False
+
+         if output_hidden_states:
+             logger.warning_once("`RWKV7Model` does not support `output_hidden_states` for now, setting it to `False`.")
+             output_hidden_states = False
+
+         # ---
+
+         # Compute the input embeddings
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         if input_ids is None and inputs_embeds is None:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+         if inputs_embeds is None:
+             inputs_embeds = self.emb(input_ids.to(self.emb.weight.device))
+         x_hidden_state = inputs_embeds
+
+         # Initialize the rwkv_state / prv_stateList
+         if rwkv_state is None or not use_cache:
+             rwkv_state = self.get_init_state(batch_size=x_hidden_state.shape[0])
+         prv_stateList = rwkv_state
+
+         # Initialize the ret_stateList
+         ret_stateList = self.get_init_state(batch_size=x_hidden_state.shape[0], skip_init_state=True)
+
+         all_hidden_states = () if output_hidden_states else None
+         all_attns = () if output_attentions else None
+         v_first = None
+         ret_sublist = None
+
+         # Let's iterate through the blocks
+         for i, block in enumerate(self.blocks):
+             # Build the full inner hidden state
+             if output_hidden_states:
+                 all_hidden_states += (x_hidden_state,)
+
+             # Forward the block
+             if self.gradient_checkpointing and self.training:
+                 x_hidden_state, ret_sublist, v_first = self._gradient_checkpointing_func(
+                     block.__call__, x_hidden_state, prv_stateList[i], v_first
+                 )
+                 ret_stateList[i] = ret_sublist
+             else:
+                 x_hidden_state, ret_sublist, v_first = block(x_hidden_state, prv_stateList[i], v_first)
+                 ret_stateList[i] = ret_sublist
+
+             # if output_attentions:
+             #     all_attns += (ret_sublist,)
+
+         # Final layer norm
+         x_hidden_state = x_hidden_state.to(self.ln_out.weight.device, non_blocking=True)
+         x_hidden_state = self.ln_out(x_hidden_state)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (x_hidden_state,)
+
+         # Return the updated per-layer states, so they can be fed back in on the next call
+         if not return_dict:
+             return tuple(i for i in [x_hidden_state, ret_stateList, all_hidden_states, all_attns] if i is not None)
+         return RWKV7Output(
+             last_hidden_state=x_hidden_state,
+             rwkv_state=ret_stateList,
+             hidden_states=all_hidden_states,
+             attentions=all_attns
+         )
+
+ @add_start_docstrings(
+     """
+     The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
+     embeddings).
+     """,
+     RWKV7_START_DOCSTRING,
+ )
+ class RWKV7ForCausalLM(RWKV7Model, GenerationMixin):
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.post_init()
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids=None,
+         attention_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         use_cache: bool = True,
+         rwkv_state: Optional[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None,
+         # num_new_tokens_if_rwkv_state: int = 1, # Only triggers if given input_ids + rwkv_state
+         num_logits_to_keep: Optional[int] = None,
+         **kwargs
+     ):
+         '''
+         Personal notes on Hugging Face's barely documented `transformers` generation hooks:
+
+         I assume this is triggered once, at the start of inference,
+         with the kwargs for each subsequent per-token forward call being updated through
+         the `_update_model_kwargs_for_generation` function instead?
+         '''
+         # # only last token for `inputs_ids` if the `past_key_values` is passed along.
+         # if rwkv_state is not None and input_ids is not None:
+         #     input_ids = input_ids[:, -num_new_tokens_if_rwkv_state:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None:
+             if input_ids is not None:
+                 raise ValueError("You cannot specify both `inputs_ids` and `inputs_embeds` at the same time")
+             model_inputs = {'inputs_embeds': inputs_embeds}
+         else:
+             # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+             # recompiles graphs as the stride of the inputs is a guard.
+             # Ref: https://github.com/huggingface/transformers/pull/29114
+             # TODO: use `next_tokens` directly instead.
+             model_inputs = {'input_ids': input_ids.contiguous()}
+
+         if num_logits_to_keep is not None:
+             model_inputs['num_logits_to_keep'] = num_logits_to_keep
+
+         model_inputs.update({
+             'rwkv_state': rwkv_state,
+             'use_cache': use_cache,
+             'attention_mask': attention_mask,
+             'num_logits_to_keep': num_logits_to_keep,
+         })
+         return model_inputs
+
+     def _update_model_kwargs_for_generation(
+         self, outputs: ModelOutput,
+         model_kwargs: Dict[str, Any],
+         num_new_tokens: int = 1,
+         **kwargs
+     ) -> Dict[str, Any]:
+         # Overwritten -- this model uses `rwkv_state`, but doesn't have a cache (`past_key_values`)
+         rwkv_state = outputs.get("rwkv_state", None)
+         input_ids = model_kwargs.get("input_ids", None)
+         attention_mask = model_kwargs.get("attention_mask", None)
+
+         # Carry the updated recurrent state forward to the next generation step
+         model_kwargs["rwkv_state"] = rwkv_state
+
+         # only keep the last tokens of `input_ids` if the state is passed along.
+         if rwkv_state is not None and input_ids is not None and num_new_tokens > 0:
+             input_ids = input_ids[:, -num_new_tokens:]
+             model_kwargs["input_ids"] = input_ids
+
+         if attention_mask is not None:
+             attention_mask = attention_mask.new_ones((attention_mask.shape[0], num_new_tokens))
+             model_kwargs["attention_mask"] = attention_mask
+
+         # Return the formatted output
+         return model_kwargs
+
+     @add_start_docstrings_to_model_forward(RWKV7_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         output_type=RWKV7CausalLMOutput,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.LongTensor] = None,  # noqa
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         rwkv_state: Optional[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs
+     ) -> Union[Tuple, RWKV7CausalLMOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+             `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+             are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         rwkv_outputs = RWKV7Model.forward(
+             self, input_ids, attention_mask, inputs_embeds,
+             rwkv_state, use_cache, output_attentions, output_hidden_states,
+             return_dict=False
+         )
+
+         # Get the hidden state, and the updated RWKV state
+         hidden_states = rwkv_outputs[0]
+         rwkv_state = rwkv_outputs[1]
+
+         # Get ALL the hidden states and attention dumps
+         all_hidden_states = rwkv_outputs[2] if output_hidden_states else None
+         if output_hidden_states:
+             all_attns = rwkv_outputs[3] if output_attentions else None
+         else:
+             all_attns = rwkv_outputs[2] if output_attentions else None
+
+         # Forward the head state
+         logits = self.head(hidden_states)
+
+         # Compute the loss from the labels
+         loss = None
+         if labels is not None:
+             # move labels to correct device to enable model parallelism
+             labels = labels.to(logits.device)
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+
+             # Compute the token loss, masked by the attention mask when one is provided
+             if attention_mask is not None:
+                 token_loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="none")
+                 submask = attention_mask[..., 1:].contiguous().view(-1)
+                 loss = (token_loss * submask).sum() / submask.sum()
+             else:
+                 loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="mean")
+
+         if not return_dict:
+             return tuple(i for i in [loss, logits, rwkv_state, all_hidden_states, all_attns] if i is not None)
+
+         return RWKV7CausalLMOutput(
+             loss=loss,
+             logits=logits,
+             rwkv_state=rwkv_state,
+             hidden_states=all_hidden_states,
+             attentions=all_attns,
+         )
+
+ __all__ = ["RWKV7ForCausalLM", "RWKV7Model", "RWKV7PreTrainedModel"]
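With the tokenizer files below and the `auto_map` in `config.json`, the whole upload can be driven through the standard remote-code path. An end-to-end sketch; the repo id is a placeholder and the prompt is arbitrary:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "picocreator/rwkv7-goose-upload"  # placeholder repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16
).to("cuda")

# Normal generation; prepare_inputs_for_generation / _update_model_kwargs_for_generation
# above carry the rwkv_state between steps instead of a KV cache.
inputs = tokenizer("The RWKV architecture is", return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0]))

# Manual forward calls with explicit state reuse between segments.
with torch.no_grad():
    first = model(input_ids=inputs.input_ids, use_cache=True)
    nxt = model(input_ids=out[:, -1:], rwkv_state=first.rwkv_state, use_cache=True)
```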
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "tokenizer_class": "GPTNeoXTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff