# from tvm.script import ir as I
# from tvm.script import tir as T
# from tvm.script import relax as R

# NOTE(review): this is machine-generated TVMScript (scheduled TIR) emitted by the
# TVM/MLC compiler; the original dump had its newlines stripped and is reformatted
# here without changing any code token. Do not hand-tune the schedules below —
# regenerate them from the compiler pipeline instead. `metadata[...]` refers to the
# serialized runtime.Module metadata that accompanies this dump.
@I.ir_module
class Module:
    I.module_attrs({"external_mods": [metadata["runtime.Module"][0], metadata["runtime.Module"][1], metadata["runtime.Module"][2], metadata["runtime.Module"][3], metadata["runtime.Module"][4], metadata["runtime.Module"][5], metadata["runtime.Module"][6], metadata["runtime.Module"][7], metadata["runtime.Module"][8], metadata["runtime.Module"][9], metadata["runtime.Module"][10], metadata["runtime.Module"][11], metadata["runtime.Module"][12], metadata["runtime.Module"][13], metadata["runtime.Module"][14]]})

    # GEMV-style NT matmul: out(1,1,1280) = layer_norm356(1,1,1280) @ W(1280,1280)^T, float16.
    # Reduction over the 1280-wide axis is factored ("rf") into per-thread partials,
    # then folded across vector lanes and finally across threadIdx.x (32 threads).
    @T.prim_func
    def NT_matmul(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        NT_matmul_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local")
        NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local")
        model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local")
        layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared")
        for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"):
            for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                    # Stage 0: cooperatively stage the 1280-element activation row into shared memory.
                    for ax0, ax1 in T.grid(T.int64(1), T.int64(1)):
                        for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}):
                            for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                                for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                                    for ax2_3 in T.vectorized(T.int64(1)):
                                        with T.block("layer_norm356_shared"):
                                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
                                            v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3)
                                            # guard: 3 * 512 = 1536 slots cover only 1280 elements
                                            T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280))
                                            T.reads(layer_norm356[v0, v1, v2])
                                            T.writes(layer_norm356_shared[v0, v1, v2])
                                            layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2]
                    # Stage 1a: zero this thread's 4 partial accumulators.
                    for u_fused_ax0_fused_fused_2_init in range(T.int64(1)):
                        for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init)
                                v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init)
                                T.reads()
                                T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0)
                    # Stage 1b: accumulate partial dot products over 5 chunks of 256 reduction elements.
                    for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        # Cache this thread's 8 weight values for the chunk in registers.
                        for ax0_ax1_fused_0 in range(T.int64(4)):
                            for ax0_ax1_fused_1 in T.vectorized(T.int64(2)):
                                with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"):
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1)
                                    v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1)
                                    T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1])
                                    T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1])
                                    model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]
                        for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)):
                            for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1)
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2)
                                    vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2])
                                    T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)])
                                    T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]
            # Stage 2: fold each thread's 4 vector-lane partials into one value per thread.
            for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                    for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax2_fused_2_1 in T.vectorized(T.int64(1)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0)
                                v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                T.reads()
                                T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0)
                            for ax1 in range(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1])
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                    T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0])
                                    T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]
            # Stage 3: cross-thread (threadIdx.x, 32-wide) reduction into the output buffer.
            for ax1_fused_2 in range(T.int64(1)):
                for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                    for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                        with T.block("NT_matmul"):
                            vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0)
                            v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2)
                            T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                            T.writes(NT_matmul[T.int64(0), T.int64(0), v0])
                            with T.init():
                                NT_matmul[T.int64(0), T.int64(0), v0] = T.float16(0)
                            NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]

    # Logits GEMV: out(1,1,51866) = layer_norm452(1,1,1280) @ embed_tokens(51866,1280)^T.
    # Same rf reduction scheme as NT_matmul but accumulating in float32 and with
    # T.where guards because 51866 is not a multiple of the 4-row tile.
    @T.prim_func
    def NT_matmul3(layer_norm452: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(51866)), "float32")):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        NT_matmul_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(51866)), scope="local")
        NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(51866)), scope="local")
        model_decoder_embed_tokens_weight5_local = T.alloc_buffer((T.int64(51866), T.int64(1280)), "float16", scope="local")
        layer_norm452_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared")
        for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(12967), thread="blockIdx.x"):
            for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                    # Stage 0: stage the activation row into shared memory (5 * 256 = 1280 exactly, no guard).
                    for ax0, ax1 in T.grid(T.int64(1), T.int64(1)):
                        for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}):
                            for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                                for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                                    for ax2_3 in T.vectorized(T.int64(1)):
                                        with T.block("layer_norm452_shared"):
                                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
                                            v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3)
                                            T.reads(layer_norm452[v0, v1, v2])
                                            T.writes(layer_norm452_shared[v0, v1, v2])
                                            layer_norm452_shared[v0, v1, v2] = layer_norm452[v0, v1, v2]
                    # Stage 1a: zero per-thread partials (guarded against the 51866 tail).
                    for u_fused_ax0_fused_fused_2_init in range(T.int64(1)):
                        for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init)
                                v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init)
                                T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init < T.int64(51866))
                                T.reads()
                                T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float32(0)
                    # Stage 1b: accumulate over 5 chunks of 256 reduction elements, casting f16 -> f32.
                    for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax0_ax1_fused_0 in range(T.int64(2)):
                            for ax0_ax1_fused_1 in T.vectorized(T.int64(2)):
                                with T.block("model_decoder_embed_tokens_weight5_local"):
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1)
                                    v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1)
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 < T.int64(51866))
                                    T.reads(model_decoder_embed_tokens_weight5[v0, v1])
                                    T.writes(model_decoder_embed_tokens_weight5_local[v0, v1])
                                    model_decoder_embed_tokens_weight5_local[v0, v1] = model_decoder_embed_tokens_weight5[v0, v1]
                        for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)):
                            for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1)
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2)
                                    vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0])
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2 < T.int64(51866))
                                    T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused])
                                    T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + T.Cast("float32", layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) * T.Cast("float32", model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused])
            # Stage 2: fold the 4 vector lanes of each thread into one partial per thread.
            for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                    for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax2_fused_2_1 in T.vectorized(T.int64(1)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0)
                                v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866))
                                T.reads()
                                T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float32(0)
                            for ax1 in range(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1])
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866))
                                    T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0])
                                    T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]
            # Stage 3: cross-thread (threadIdx.x, 64-wide) reduction into the float32 output.
            for ax1_fused_2 in range(T.int64(1)):
                for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                    for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                        with T.block("NT_matmul"):
                            vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0)
                            v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2)
                            T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax1_fused_0_ax1_fused_1_fused % T.int64(4) + ax1_fused_2) < T.int64(51866))
                            T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                            T.writes(NT_matmul[T.int64(0), T.int64(0), v0])
                            with T.init():
                                NT_matmul[T.int64(0), T.int64(0), v0] = T.float32(0)
                            NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]

    # Elementwise add over (batch_size, 1, 1280) float16, flattened across a 1024-thread grid.
    @T.prim_func
    def add(var_reshape708: T.handle, var_reshape709: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size = T.int64()
        reshape708 = T.match_buffer(var_reshape708, (batch_size, T.int64(1), T.int64(1280)), "float16")
        reshape709 = T.match_buffer(var_reshape709, (batch_size, T.int64(1), T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1), T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"):
            for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280))
                    v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280))
                    T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280))
                    T.reads(reshape708[v0, T.int64(0), v1], reshape709[v0, T.int64(0), v1])
                    T.writes(T_add[v0, T.int64(0), v1])
                    T_add[v0, T.int64(0), v1] = reshape708[v0, T.int64(0), v1] + reshape709[v0, T.int64(0), v1]

    # Elementwise add over (batch_size, 1500, 1280) float16.
    # 1500 * 1280 = 1920000 = 1875 * 1024, so the launch is exact and needs no T.where guard.
    @T.prim_func
    def add4(var_add: T.handle, var_lv610: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size = T.int64()
        add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        lv610 = T.match_buffer(var_lv610, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"):
            for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000))
                    v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280))
                    v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280))
                    T.reads(add[v0, v1, v2], lv610[v0, v1, v2])
                    T.writes(T_add[v0, v1, v2])
                    T_add[v0, v1, v2] = add[v0, v1, v2] + lv610[v0, v1, v2]

    # Elementwise add over (1, seq_len, 1280) float16 — same scheme as `add` but with
    # the dynamic dimension in the middle axis.
    @T.prim_func
    def add5(var_reshape385: T.handle, var_reshape386: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        seq_len = T.int64()
        reshape385 = T.match_buffer(var_reshape385, (T.int64(1), seq_len, T.int64(1280)), "float16")
        reshape386 = T.match_buffer(var_reshape386, (T.int64(1), seq_len, T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (T.int64(1), seq_len, T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"):
            for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280))
                    v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280))
                    T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280))
                    T.reads(reshape385[T.int64(0), v0, v1], reshape386[T.int64(0), v0, v1])
                    T.writes(T_add[T.int64(0), v0, v1])
                    T_add[T.int64(0), v0, v1] = reshape385[T.int64(0), v0, v1] + reshape386[T.int64(0), v0, v1]

    # In-place vocabulary masking: where bit vv of the per-sequence bitmask is 0, the
    # logit is replaced by float32 lowest (-3.4028234663852886e+38); rows are selected
    # indirectly through seq_ids.
    @T.prim_func
    def apply_bitmask_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_bitmask: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_seq = T.int32(is_size_var=True)
        seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32")
        bitmask = T.match_buffer(var_bitmask, (batch_size, (vocab_size + 31) // 32), "int32")
        # with T.block("root"):
        for fused_s_v_0 in T.thread_binding((num_seq * vocab_size + 1023) // 1024, thread="blockIdx.x"):
            for fused_s_v_1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vs = T.axis.spatial(num_seq, (fused_s_v_0 * 1024 + fused_s_v_1) // vocab_size)
                    vv = T.axis.spatial(vocab_size, (fused_s_v_0 * 1024 + fused_s_v_1) % vocab_size)
                    T.where(fused_s_v_0 * 1024 + fused_s_v_1 < num_seq * vocab_size)
                    T.reads(bitmask[seq_ids[vs], vv // 32], seq_ids[vs], logits[seq_ids[vs], vv])
                    T.writes(logits[seq_ids[vs], vv])
                    logits[seq_ids[vs], vv] = T.if_then_else(T.bitwise_and(T.shift_right(bitmask[seq_ids[vs], vv // 32], vv % 32), 1) == 1, logits[seq_ids[vs], vv], T.float32(-3.4028234663852886e+38))

    # In-place additive logit bias: one (sequence, token) pair per flattened thread,
    # indexed indirectly through pos2seq_id / token_ids.
    @T.prim_func
    def apply_logit_bias_inplace(var_logits: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_logit_bias: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_token = T.int32(is_size_var=True)
        pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32")
        token_ids = T.match_buffer(var_token_ids, (num_token,), "int32")
        logit_bias = T.match_buffer(var_logit_bias, (num_token,))
        # with T.block("root"):
        for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"):
            for p1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vp = T.axis.spatial(num_token, p0 * 1024 + p1)
                    T.where(p0 * 1024 + p1 < num_token)
                    T.reads(logits[pos2seq_id[vp], token_ids[vp]], pos2seq_id[vp], token_ids[vp], logit_bias[vp])
                    T.writes(logits[pos2seq_id[vp], token_ids[vp]])
                    logits[pos2seq_id[vp], token_ids[vp]] = logits[pos2seq_id[vp], token_ids[vp]] + logit_bias[vp]

    # In-place sampling penalties. penalties[s] holds 3 values used as:
    # [0] plus count-scaled [1] are subtracted, then [2] divides positive logits and
    # multiplies non-positive ones (the two-sided repetition-penalty form).
    @T.prim_func
    def apply_penalty_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_token_cnt: T.handle, var_penalties: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_seq = T.int32(is_size_var=True)
        seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32")
        num_token = T.int32(is_size_var=True)
        pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32")
        token_ids = T.match_buffer(var_token_ids, (num_token,), "int32")
        token_cnt = T.match_buffer(var_token_cnt, (num_token,), "int32")
        penalties = T.match_buffer(var_penalties, (num_seq, 3))
        # with T.block("root"):
        for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"):
            for p1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vp = T.axis.spatial(num_token, p0 * 1024 + p1)
                    T.where(p0 * 1024 + p1 < num_token)
                    T.reads(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]], seq_ids[pos2seq_id[vp]], pos2seq_id[vp], token_ids[vp], penalties[pos2seq_id[vp], 0:3], token_cnt[vp])
                    T.writes(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]])
                    logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] - (penalties[pos2seq_id[vp], 0] + T.Cast("float32", token_cnt[vp]) * penalties[pos2seq_id[vp], 1])
                    logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = T.if_then_else(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] > T.float32(0), logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] * penalties[pos2seq_id[vp], 2], logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] / penalties[pos2seq_id[vp], 2])

    # Row-wise argsort delegated to Thrust via tvm.contrib.thrust.sort; the workspace
    # buffer sizing and the sorted-values scratch buffer are opaque to TIR.
    @T.prim_func
    def argsort_thrust(var_probs: T.handle, var_lv: T.handle, var_topk_gpu_v1: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int64(), T.int64()
        data_buf = T.match_buffer(var_probs, (batch_size, vocab_size), align=8)
        workspace_buf = T.match_buffer(var_lv, (T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8)
        indices_buf = T.match_buffer(var_topk_gpu_v1, (batch_size, vocab_size), "int32", align=8)
        # with T.block("root"):
        value_buf = T.alloc_buffer((batch_size, vocab_size), align=8)
        with T.block("topk_gpu"):
            T.reads()
            T.writes()
            T.call_packed("tvm.contrib.thrust.sort", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(value_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(indices_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, 0, T.int64(0)), 0, T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0)))

    # Single-token (decode) paged-KV attention: 20 heads, head_dim 64, page size 16,
    # optional rotary embedding, online-softmax accumulation in float32 with a final
    # cross-threadIdx.z merge through shared memory. Emits attention output and LSE.
    @T.prim_func
    def batch_decode_paged_kv(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1})
        B = T.int32(is_size_var=True)
        Q = T.match_buffer(Q_handle, (B, 20, 64), "float16")
        max_num_pages = T.int32(is_size_var=True)
        pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16")
        page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1)
        nnz_pages = T.int32(is_size_var=True)
        page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1)
        length_info = T.match_buffer(var_length_info, (B,), "int32", offset_factor=1)
        k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", offset_factor=1)
        q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1)
        output = T.match_buffer(output_handle, (B, 20, 64), "float16")
        lse = T.match_buffer(lse_handle, (B, 20))
        # with T.block("root"):
        sm_scale: T.float32 = T.float32(0.18033688011112042)
        for bx in T.thread_binding(B, thread="blockIdx.x"):
            for fused_by_bz in T.thread_binding(20, thread="blockIdx.y"):
                for ty in T.thread_binding(1, thread="threadIdx.y"):
                    for tx in T.thread_binding(16, thread="threadIdx.x"):
                        for tz in T.thread_binding(32, thread="threadIdx.z"):
                            with T.block("attn"):
                                T.reads(page_table_indptr[bx:bx + 2], length_info[bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68])
                                T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty])
                                Q_local = T.alloc_buffer((4,), "float16", scope="local")
                                kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local")
                                K_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                V_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared")
                                md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared")
                                S_reduce_local = T.alloc_buffer((1,), scope="local")
                                t0 = T.alloc_buffer((1,), scope="local")
                                S_local = T.alloc_buffer((2,), scope="local")
                                QK_local = T.alloc_buffer((4,), scope="local")
                                V_local = T.alloc_buffer((4,), "float16", scope="local")
                                m_prev = T.alloc_buffer((1,), scope="local")
                                d_prev = T.alloc_buffer((1,), scope="local")
                                other_m = T.alloc_buffer((1,), scope="local")
                                other_d = T.alloc_buffer((1,), scope="local")
                                exp_mprev = T.alloc_buffer((1,), scope="local")
                                exp_otherm = T.alloc_buffer((1,), scope="local")
                                other_o = T.alloc_buffer((4,), scope="local")
                                st_m = T.alloc_buffer((1,), scope="local")
                                st_d = T.alloc_buffer((1,), scope="local")
                                O_local = T.alloc_buffer((4,), scope="local")
                                by: T.int32 = fused_by_bz % 20
                                bz: T.int32 = fused_by_bz // 20
                                batch_idx: T.int32 = bx
                                cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx]
                                cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1]
                                # Sequence length: full pages plus the (possibly partial) last page.
                                kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[batch_idx], 0)
                                # Online-softmax running state: max (st_m), denominator (st_d), output accum.
                                st_m[0] = T.float32(-50000)
                                st_d[0] = T.float32(1)
                                for vec in T.vectorized(4):
                                    O_local[vec] = T.float32(0)
                                # Load this thread's 4 query lanes, optionally applying rotary embedding.
                                for vec in T.vectorized(4):
                                    Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 32]))), Q[bx, by + bz + ty, tx * 4 + vec])
                                # Process the KV sequence in 64-row tiles staged through shared memory.
                                for iterator in range((kv_chunk_len[0] + 63) // 64):
                                    tile_start_s: T.int32 = (tz + ty) * 2
                                    tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2
                                    for j in range(2):
                                        with T.block("KV_load"):
                                            T.reads()
                                            T.writes()
                                            row_g: T.int32 = tile_start_g + j
                                            if row_g < kv_chunk_len[0]:
                                                seq_offset: T.int32 = row_g
                                                page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16]
                                                page_offset: T.int32 = seq_offset % 16
                                                for vec in T.vectorized(4):
                                                    K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec])
                                                    V_smem[tile_start_s + j, tx * 4 + vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec]
                                            else:
                                                for vec in T.vectorized(4):
                                                    K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0)
                                                    V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0)
                                    T.tvm_storage_sync("shared")
                                    m_prev[0] = st_m[0]
                                    # Scores for this thread's 2 rows: partial dot, then allreduce over tx.
                                    for j in range(2):
                                        for vec in T.vectorized(4):
                                            QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale
                                        S_reduce_local[0] = T.float32(0)
                                        for vec in T.unroll(4):
                                            S_reduce_local[0] = S_reduce_local[0] + QK_local[vec]
                                        with T.block("block_cross_thread"):
                                            T.reads(S_reduce_local[0])
                                            T.writes(t0[0])
                                            T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0)))
                                            T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx)
                                        S_local[j] = T.float32(-50000)
                                        if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]:
                                            S_local[j] = t0[0]
                                        st_m[0] = T.max(st_m[0], S_local[j])
                                    # Rescale running denominator/accumulator by the max shift (base-2 softmax).
                                    o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0])
                                    st_d[0] = st_d[0] * o_scale
                                    for j in range(2):
                                        S_local[j] = T.exp2(S_local[j] - st_m[0])
                                        st_d[0] = st_d[0] + S_local[j]
                                    for j in T.vectorized(4):
                                        O_local[j] = O_local[j] * o_scale
                                    for j in range(2):
                                        for vec in T.vectorized(4):
                                            V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec]
                                        for vec in T.vectorized(4):
                                            O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * S_local[j]
                                # Merge the 32 threadIdx.z partial results through shared memory.
                                for vec in T.vectorized(4):
                                    O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec]
                                md_allreduce[tz, ty, 0] = st_m[0]
                                md_allreduce[tz, ty, 1] = st_d[0]
                                T.tvm_storage_sync("shared")
                                st_m[0] = T.float32(-50000)
                                st_d[0] = T.float32(1)
                                for vec in T.vectorized(4):
                                    O_local[vec] = T.float32(0)
                                for j in range(32):
                                    m_prev[0] = st_m[0]
                                    d_prev[0] = st_d[0]
                                    other_m[0] = md_allreduce[j, ty, 0]
                                    other_d[0] = md_allreduce[j, ty, 1]
                                    for vec in T.vectorized(4):
                                        other_o[vec] = O_allreduce[j, ty, tx * 4 + vec]
                                    st_m[0] = T.max(st_m[0], other_m[0])
                                    st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0])
                                    exp_mprev[0] = T.exp2(m_prev[0] - st_m[0])
                                    exp_otherm[0] = T.exp2(other_m[0] - st_m[0])
                                    for vec in T.vectorized(4):
                                        O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0]
                                # Final normalization, output store, and log-sum-exp (base 2).
                                for vec in T.vectorized(4):
                                    O_local[vec] = O_local[vec] / st_d[0]
                                for vec in T.vectorized(4):
                                    output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec])
                                lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0])

    # Sliding-window variant of the decode attention kernel. NOTE(review): this chunk
    # of the dump is truncated mid-definition; the text below reproduces it verbatim
    # up to the cut point and the remainder continues in the next chunk of the file.
    @T.prim_func
    def batch_decode_paged_kv_sliding_window(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1})
        B = T.int32(is_size_var=True)
        Q = T.match_buffer(Q_handle, (B, 20, 64), "float16")
        max_num_pages = T.int32(is_size_var=True)
        pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16")
        page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1)
        nnz_pages = T.int32(is_size_var=True)
        page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1)
        length_info = T.match_buffer(var_length_info, (3, B), "int32", offset_factor=1)
        k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", offset_factor=1)
        q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1)
        output = T.match_buffer(output_handle, (B, 20, 64), "float16")
        lse = T.match_buffer(lse_handle, (B, 20))
        # with T.block("root"):
        sm_scale: T.float32 = T.float32(0.18033688011112042)
        for bx in T.thread_binding(B, thread="blockIdx.x"):
            for fused_by_bz in T.thread_binding(20, thread="blockIdx.y"):
                for ty in T.thread_binding(1, thread="threadIdx.y"):
                    for tx in T.thread_binding(16, thread="threadIdx.x"):
                        for tz in T.thread_binding(32, thread="threadIdx.z"):
                            with T.block("attn"):
                                T.reads(page_table_indptr[bx:bx + 2], length_info[0:3, bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68])
                                T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty])
                                Q_local = T.alloc_buffer((4,), "float16", scope="local")
                                kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local")
                                K_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                V_smem = 
T.alloc_buffer((64, 64), "float16", scope="shared") O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared") md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared") S_reduce_local = T.alloc_buffer((1,), scope="local") t0 = T.alloc_buffer((1,), scope="local") S_local = T.alloc_buffer((2,), scope="local") QK_local = T.alloc_buffer((4,), scope="local") V_local = T.alloc_buffer((4,), "float16", scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_prev = T.alloc_buffer((1,), scope="local") other_m = T.alloc_buffer((1,), scope="local") other_d = T.alloc_buffer((1,), scope="local") exp_mprev = T.alloc_buffer((1,), scope="local") exp_otherm = T.alloc_buffer((1,), scope="local") other_o = T.alloc_buffer((4,), scope="local") st_m = T.alloc_buffer((1,), scope="local") st_d = T.alloc_buffer((1,), scope="local") O_local = T.alloc_buffer((4,), scope="local") by: T.int32 = fused_by_bz % 20 bz: T.int32 = fused_by_bz // 20 batch_idx: T.int32 = bx cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx] cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, batch_idx] - length_info[1, batch_idx] + length_info[2, batch_idx], 0) st_m[0] = T.float32(-50000) st_d[0] = T.float32(1) for vec in T.vectorized(4): O_local[vec] = T.float32(0) for vec in T.vectorized(4): Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 
32]))), Q[bx, by + bz + ty, tx * 4 + vec]) for iterator in range((kv_chunk_len[0] + 63) // 64): tile_start_s: T.int32 = (tz + ty) * 2 tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2 for j in range(2): with T.block("KV_load"): T.reads() T.writes() row_g: T.int32 = tile_start_g + j if row_g < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(row_g < length_info[2, batch_idx], row_g, row_g - length_info[2, batch_idx] + length_info[1, batch_idx]) page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 for vec in T.vectorized(4): K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec]) V_smem[tile_start_s + j, tx * 4 + vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec] else: for vec in T.vectorized(4): K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) T.tvm_storage_sync("shared") m_prev[0] = st_m[0] for j in range(2): for vec in T.vectorized(4): QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale S_reduce_local[0] = T.float32(0) for vec in T.unroll(4): S_reduce_local[0] = S_reduce_local[0] + QK_local[vec] with T.block("block_cross_thread"): T.reads(S_reduce_local[0]) T.writes(t0[0]) T.attr(T.comm_reducer(lambda 
x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx) S_local[j] = T.float32(-50000) if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]: S_local[j] = t0[0] st_m[0] = T.max(st_m[0], S_local[j]) o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0]) st_d[0] = st_d[0] * o_scale for j in range(2): S_local[j] = T.exp2(S_local[j] - st_m[0]) st_d[0] = st_d[0] + S_local[j] for j in T.vectorized(4): O_local[j] = O_local[j] * o_scale for j in range(2): for vec in T.vectorized(4): V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec] for vec in T.vectorized(4): O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * S_local[j] for vec in T.vectorized(4): O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec] md_allreduce[tz, ty, 0] = st_m[0] md_allreduce[tz, ty, 1] = st_d[0] T.tvm_storage_sync("shared") st_m[0] = T.float32(-50000) st_d[0] = T.float32(1) for vec in T.vectorized(4): O_local[vec] = T.float32(0) for j in range(32): m_prev[0] = st_m[0] d_prev[0] = st_d[0] other_m[0] = md_allreduce[j, ty, 0] other_d[0] = md_allreduce[j, ty, 1] for vec in T.vectorized(4): other_o[vec] = O_allreduce[j, ty, tx * 4 + vec] st_m[0] = T.max(st_m[0], other_m[0]) st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0]) exp_mprev[0] = T.exp2(m_prev[0] - st_m[0]) exp_otherm[0] = T.exp2(other_m[0] - st_m[0]) for vec in T.vectorized(4): O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0] for vec in T.vectorized(4): O_local[vec] = O_local[vec] / st_d[0] for vec in T.vectorized(4): output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec]) lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0]) @T.prim_func def batch_prefill_paged_kv(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: 
T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) total_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (total_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) max_num_pages = T.int32(is_size_var=True) pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) nnz_pages = T.int32(is_size_var=True) page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) length_info = T.match_buffer(var_length_info, (batch_size,), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) output = T.match_buffer(var_output, (total_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (total_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = 
T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] cur_page_indptr_begin: T.int32 = page_indptr[b_idx] cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[b_idx], 0) T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in 
T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + 
lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = cur_L page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = cur_L page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 V_smem[i, j] = pages[page_no, 1, by, page_offset, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with 
T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] 
= d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k = 
T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k], V_smem[k, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_prefill_paged_kv_sliding_window(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": 
"x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) total_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (total_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) max_num_pages = T.int32(is_size_var=True) pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) nnz_pages = T.int32(is_size_var=True) page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) length_info = T.match_buffer(var_length_info, (3, batch_size), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) output = T.match_buffer(var_output, (total_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (total_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", 
scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] cur_page_indptr_begin: T.int32 = page_indptr[b_idx] cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, b_idx] - length_info[1, b_idx] + length_info[2, b_idx], 0) T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() 
T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, b_idx] + length_info[1, b_idx]) page_no: T.int32 = 
page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, b_idx] + length_info[1, b_idx]) page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 V_smem[i, j] = pages[page_no, 1, by, page_offset, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + 
li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 
= i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], 
S_smem[i, k], V_smem[k, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_prefill_ragged_kv(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_k_rope_pos_offset: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) qo_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) kv_len = T.int32(is_size_var=True) k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (qo_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), 
scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] q_indptr_val: T.int32 = q_indptr[b_idx] LH_start: T.int32 = tile_id[0] * 32 kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if 
cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 L_kv_base: T.int32 = kv_indptr[b_idx] for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", k[L_kv_base + cur_L, by, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, k[L_kv_base + cur_L, by, j + 32] * T.float16(-1), k[L_kv_base + cur_L, by, j - 32]))), k[L_kv_base + cur_L, by, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in 
T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: V_smem[i, j] = v[L_kv_base + cur_L, by, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) * T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = 
T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init 
in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], 
# NOTE(review): collapsed auto-generated TVMScript; code tokens are byte-identical,
# only standalone comment lines are added. This line opens with the tail of
# batch_prefill_ragged_kv (its lse_store epilogue and tile_id advance). The def of
# batch_tree_attn begins mid-line: the same tiled online-softmax attention pipeline,
# but column validity comes from an explicit packed tree mask (mask / mn_indptr)
# instead of a causal flag — used for token-tree (speculative decoding) attention.
m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_tree_attn(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_mn_indptr: T.handle, var_mask: T.handle, var_output: T.handle, var_lse: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32, batch_size: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) qo_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) kv_len = T.int32(is_size_var=True) k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) mn_indptr = T.match_buffer(var_mn_indptr, (batch_size + 1,), "int32", offset_factor=1) tree_size = T.int32(is_size_var=True) mask = T.match_buffer(var_mask, (tree_size,), "int32", offset_factor=1) output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (qo_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4,
# Thread geometry matches the prefill kernel: 16 x 20 CTAs (blockIdx.x x heads) of
# 4 x 32 threads. Per-CTA shared/local staging buffers follow, then the persistent
# tile loop that distributes 32-row query tiles of the batch across CTAs and
# initializes the online-softmax state (m = -50000, d = 1).
thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1)
# O_init zeroes the 32x64 fp32 accumulator; Q_load stages the Q tile with optional
# inline rotary embedding. Note the rotary math here casts cos/sin to fp16 and
# multiplies in fp16, unlike batch_prefill_ragged_kv which computes in fp32 and
# casts once at the end — presumably intentional in the generator, but worth noting.
for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * q[cur_L, cur_H_qo, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 L_kv_base: T.int32 = kv_indptr[b_idx] for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("KV_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 +
# KV_load stages K (rotary indexed by q_rope_position at the absolute KV position)
# and V together in one pass; rows past kv_chunk_len are zero-filled. Then S_gemm
# computes fp32 Q @ K^T with the same attn_score_scaling_factor * 0.18033688 scale
# as the prefill kernel.
lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_base + L_kv_start + i if L_kv_start + i < kv_chunk_len[0]: K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * k[cur_L, by, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, k[cur_L, by, j + 32] * T.float16(-1), k[cur_L, by, j - 32]), k[cur_L, by, j]) V_smem[i, j] = v[cur_L, by, j] else: K_smem[i, j] = T.float16(0) V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) *
# update1/update: online softmax per row where column j of this KV chunk is valid
# iff L_kv_start + j < kv_chunk_len AND
# mask[mn_indptr[b_idx] + row_ * q_len + (L_kv_start + j)] == 1 — i.e. a flattened
# per-request row-major tree attention mask; invalid columns collapse to ~0 weight.
T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1: m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1:
# Finish the softmax step (S := exp2(S - m_new), accumulate d), then rescale O by
# exp2(m_prev - m_new) in O_gemm_init and accumulate S @ V in O_gemm_update;
# the O_store / lse_store epilogue continues on the next physical line.
S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32
+ li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_verify_on_gpu_single_kernel(var_draft_probs: T.handle, var_draft_tokens: T.handle, var_model_probs: T.handle, var_token_tree_first_child: T.handle, var_token_tree_next_sibling: T.handle, var_uniform_samples: T.handle, var_token_tree_parent_ptr: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_nodes, vocab_size = T.int32(is_size_var=True), T.int64() draft_probs = T.match_buffer(var_draft_probs, (num_nodes, vocab_size)) draft_tokens = T.match_buffer(var_draft_tokens, (num_nodes,), "int32") model_probs = T.match_buffer(var_model_probs, (num_nodes, vocab_size)) token_tree_first_child = T.match_buffer(var_token_tree_first_child, (num_nodes,), "int32") token_tree_next_sibling = T.match_buffer(var_token_tree_next_sibling, (num_nodes,), 
"int32") uniform_samples = T.match_buffer(var_uniform_samples, (num_nodes,)) nbatch = T.int32(is_size_var=True) token_tree_parent_ptr = T.match_buffer(var_token_tree_parent_ptr, (nbatch,), "int32") # with T.block("root"): child_ptr = T.alloc_buffer((1,), "int32", scope="local") parent_ptr = T.alloc_buffer((1,), "int32", scope="local") child_token = T.alloc_buffer((1,), "int32", scope="local") done = T.alloc_buffer((1,), "bool", scope="local") psum = T.alloc_buffer((1,), scope="local") t0 = T.alloc_buffer((1,), scope="local") model_prob_local = T.alloc_buffer((1,), scope="local") draft_prob_local = T.alloc_buffer((1,), scope="local") p_child = T.alloc_buffer((1,), scope="local") q_child = T.alloc_buffer((1,), scope="local") uniform_sample = T.alloc_buffer((1,), scope="local") pred_shared = T.alloc_buffer((1,), "bool", scope="shared") pred_local = T.alloc_buffer((1,), "bool", scope="local") for _bx in T.thread_binding(nbatch, thread="blockIdx.x"): for _tx in T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): b, tx = T.axis.remap("SS", [_bx, _tx]) T.reads(token_tree_parent_ptr[b], token_tree_first_child[T.min(parent_ptr[0], child_ptr[0]):T.min(parent_ptr[0], child_ptr[0]) + (T.max(parent_ptr[0], child_ptr[0]) + 1 - T.min(parent_ptr[0], child_ptr[0]))], parent_ptr[0], done[0], child_ptr[0], draft_tokens[child_ptr[0]], model_probs[parent_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], child_token[0], draft_probs[child_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + 
T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], uniform_samples[child_ptr[0]], p_child[0], uniform_sample[0], q_child[0], pred_shared[0], pred_local[0], model_prob_local[0], draft_prob_local[0], psum[0], t0[0], token_tree_next_sibling[child_ptr[0]]) T.writes(parent_ptr[0], child_ptr[0], done[0], child_token[0], p_child[0], q_child[0], uniform_sample[0], pred_shared[0], pred_local[0], psum[0], model_prob_local[0], draft_prob_local[0], t0[0], model_probs[parent_ptr[0], T.Cast("int64", tx):T.Cast("int64", tx) + ((vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) - T.int64(1023))], token_tree_parent_ptr[b]) parent_ptr[0] = token_tree_parent_ptr[b] child_ptr[0] = token_tree_first_child[parent_ptr[0]] done[0] = T.bool(False) while not done[0]: T.tvm_storage_sync("shared") if child_ptr[0] == -1: done[0] = T.bool(True) T.tvm_storage_sync("shared") else: if tx == 0: child_token[0] = draft_tokens[child_ptr[0]] p_child[0] = model_probs[parent_ptr[0], child_token[0]] q_child[0] = draft_probs[child_ptr[0], child_token[0]] uniform_sample[0] = uniform_samples[child_ptr[0]] pred_shared[0] = p_child[0] >= uniform_sample[0] * q_child[0] T.tvm_storage_sync("shared") pred_local[0] = pred_shared[0] if pred_local[0]: parent_ptr[0] = child_ptr[0] child_ptr[0] = token_tree_first_child[child_ptr[0]] else: psum[0] = T.float32(0) for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] model_prob_local[0] = T.max(model_prob_local[0] - draft_prob_local[0], T.float32(0)) psum[0] = psum[0] + model_prob_local[0] with T.block("block_cross_thread"): T.reads(psum[0]) T.writes(t0[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", 
T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), psum[0], T.bool(True), t0[0], tx) if t0[0] < T.float32(9.9999999999999995e-08): parent_ptr[0] = child_ptr[0] child_ptr[0] = token_tree_first_child[child_ptr[0]] else: for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] model_prob_local[0] = T.max(model_prob_local[0] - draft_prob_local[0], T.float32(0)) model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] = model_prob_local[0] / t0[0] child_ptr[0] = token_tree_next_sibling[child_ptr[0]] if tx == 0: token_tree_parent_ptr[b] = parent_ptr[0]
# ---------------------------------------------------------------------------
# chunk_lse(A, temperature) -> (chunked_sum, chunked_max)
# Per (batch row, 4096-wide chunk) reduction over the logits matrix A.
#   * block "max":     chunk maximum of A (divided by temperature when
#                      temperature > ~1e-5); out-of-vocab tail lanes
#                      (v1*4096 + v2 >= vocab_size) feed the identity -FLT_MAX.
#   * block "sum_exp": when temperature > ~1e-5 accumulates
#                      exp(x - chunk_max); otherwise it counts entries equal
#                      to the chunk max (greedy/argmax path).
#   * block "log":     chunked_sum = log(sum) in the softmax path, else the
#                      raw count; chunked_max = the chunk max.
# One CUDA block per (row, chunk); 256 threads x 16 serial steps cover the
# 4096 lanes of a chunk.
@T.prim_func def chunk_lse(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) A = T.match_buffer(var_A, (batch_size, vocab_size)) temperature = T.match_buffer(var_temperature, (batch_size,)) num_chunks = T.int64(is_size_var=True) chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) # with T.block("root"): temp_max_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") temp_sum_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") for ax0_ax1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): for ax0, ax1 in T.grid(T.int64(1), 
T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("max"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2]) T.writes(temp_max_shared[v0, v1]) with T.init(): temp_max_shared[v0, v1] = T.float32(-3.4028234663852886e+38) temp_max_shared[v0, v1] = T.max(temp_max_shared[v0, v1], T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38))) for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("sum_exp"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_max_shared[v0, v1]) T.writes(temp_sum_shared[v0, v1]) with T.init(): temp_sum_shared[v0, v1] = T.float32(0) temp_sum_shared[v0, v1] = temp_sum_shared[v0, v1] + T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), 
T.float32(-3.4028234663852886e+38)) - temp_max_shared[v0, v1]), T.Cast("float32", T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38)) == temp_max_shared[v0, v1])), T.float32(0)) for ax2_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("log"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks) v2 = T.axis.spatial(T.int64(1), ax2_0 * T.int64(256) + ax2_1) T.where(ax2_0 * T.int64(256) + ax2_1 < T.int64(1)) T.reads(temperature[v0], temp_sum_shared[v0, v1], temp_max_shared[v0, v1]) T.writes(chunked_sum[v0, v1], chunked_max[v0, v1]) chunked_sum[v0, v1] = T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.log(temp_sum_shared[v0, v1]), temp_sum_shared[v0, v1]) chunked_max[v0, v1] = temp_max_shared[v0, v1]
# ---------------------------------------------------------------------------
# compact_kv_copy: compacts the paged KV cache (layout
# (num_pages, K/V, 20 heads, page_size 16, head_dim 64), fp16).  For each
# batch element b, the CSR-style range copy_length_indptr[b]..[b+1] indexes
# columns of copy_src_dst_pos = (src slot, dst slot) pairs; both the K
# (index 0) and V (index 1) planes are copied slot-by-slot.  One thread per
# (b, head, dim) triple.
@T.prim_func def compact_kv_copy(var_pages: T.handle, var_copy_length_indptr: T.handle, var_copy_src_dst_pos: T.handle, batch_size: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) num_pages = T.int32() pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") copy_length_indptr = T.match_buffer(var_copy_length_indptr, (batch_size + 1,), "int32", offset_factor=1) total_copy_length = T.int32() copy_src_dst_pos = T.match_buffer(var_copy_src_dst_pos, (2, total_copy_length), "int32", offset_factor=1) with 
T.block("root"): T.reads() T.writes() for bhd_o in T.thread_binding((batch_size * 1280 + 1023) // 1024, thread="blockIdx.x"): for bhd_i in T.thread_binding(1024, thread="threadIdx.x"): b: T.int32 = (bhd_o * 1024 + bhd_i) // 1280 h: T.int32 = (bhd_o * 1024 + bhd_i) // 64 % 20 d: T.int32 = (bhd_o * 1024 + bhd_i) % 64 if bhd_o * 1024 + bhd_i < batch_size * 20 * 64: for i in range(copy_length_indptr[b + 1] - copy_length_indptr[b]): src_pos: T.int32 = copy_src_dst_pos[0, copy_length_indptr[b] + i] dst_pos: T.int32 = copy_src_dst_pos[1, copy_length_indptr[b] + i] pages[dst_pos // 16, 0, h, dst_pos % 16, d] = pages[src_pos // 16, 0, h, src_pos % 16, d] pages[dst_pos // 16, 1, h, dst_pos % 16, d] = pages[src_pos // 16, 1, h, src_pos % 16, d]
# ---------------------------------------------------------------------------
# concatenate: T_concat (batch, 1, 60, 64) = reshape710 ++ reshape711 ++
# reshape712 stacked along axis 2 (20 rows each -> 60), fp16.  Flattened
# index over 1024-thread blocks with a T.where bounds guard on the tail.
@T.prim_func def concatenate(var_reshape710: T.handle, var_reshape711: T.handle, var_reshape712: T.handle, var_T_concat: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape710 = T.match_buffer(var_reshape710, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") reshape711 = T.match_buffer(var_reshape711, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") reshape712 = T.match_buffer(var_reshape712, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_concat = T.match_buffer(var_T_concat, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_concat"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) 
T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) T.reads(reshape712[v0, T.int64(0), v1 + T.int64(-40), v2], reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2]) T.writes(T_concat[v0, T.int64(0), v1, v2]) T_concat[v0, T.int64(0), v1, v2] = T.if_then_else(T.int64(40) <= v1, reshape712[v0, T.int64(0), v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2]))
# ---------------------------------------------------------------------------
# concatenate1: same 3-way axis-2 concat as `concatenate`, but for the
# sequence-major layout (1, seq_len, 20, 64) -> (1, seq_len, 60, 64).
@T.prim_func def concatenate1(var_reshape387: T.handle, var_reshape388: T.handle, var_reshape389: T.handle, var_T_concat: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape387 = T.match_buffer(var_reshape387, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") reshape388 = T.match_buffer(var_reshape388, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") reshape389 = T.match_buffer(var_reshape389, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_concat = T.match_buffer(var_T_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_concat"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) T.reads(reshape389[T.int64(0), v0, v1 + T.int64(-40), v2], reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2]) 
T.writes(T_concat[T.int64(0), v0, v1, v2]) T_concat[T.int64(0), v0, v1, v2] = T.if_then_else(T.int64(40) <= v1, reshape389[T.int64(0), v0, v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2]))
# ---------------------------------------------------------------------------
# copy_single_page: copies one KV-cache page (both K and V planes, 20 heads,
# head_dim 64, first copy_length slots) from src_page_id to tgt_page_id.
@T.prim_func def copy_single_page(var_pages: T.handle, src_page_id: T.int64, tgt_page_id: T.int64, copy_length: T.int64): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) num_pages, page_size = T.int32(), T.int64() pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") # with T.block("root"): for b in T.thread_binding((copy_length * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for t in T.thread_binding(1024, thread="threadIdx.x"): with T.block("copy"): vh = T.axis.spatial(20, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) // (copy_length * T.int64(64)))) vp = T.axis.spatial(copy_length, (b * T.int64(1024) + T.Cast("int64", t)) % (copy_length * T.int64(64)) // T.int64(64)) vd = T.axis.spatial(64, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) % T.int64(64))) T.reads(pages[src_page_id, 0:2, vh, vp, vd]) T.writes(pages[tgt_page_id, 0:2, vh, vp, vd]) pages[tgt_page_id, 0, vh, vp, vd] = pages[src_page_id, 0, vh, vp, vd] pages[tgt_page_id, 1, vh, vp, vd] = pages[src_page_id, 1, vh, vp, vd]
# ---------------------------------------------------------------------------
# cumsum: thin wrapper that hands the (batch, vocab) probability matrix to
# Thrust ("tvm.contrib.thrust.sum_scan") for a row-wise sum scan, with a
# preallocated byte workspace.  NOTE(review): the T.bool(False) argument
# presumably selects the exclusive/inclusive scan variant -- confirm against
# the TVM thrust binding before relying on it.
@T.prim_func def cumsum(var_sorted_probs: T.handle, var_lv1: T.handle, var_exclusive_scan_thrust: T.handle): T.func_attr({"tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(), T.int64() data_buf = T.match_buffer(var_sorted_probs, (batch_size, vocab_size), align=8) workspace_buf = T.match_buffer(var_lv1, 
(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8) output_buf = T.match_buffer(var_exclusive_scan_thrust, (batch_size, vocab_size), align=8) with T.block("exclusive_scan_thrust"): T.reads() T.writes() T.call_packed("tvm.contrib.thrust.sum_scan", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(output_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.bool(False), T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0)))
# ---------------------------------------------------------------------------
# full: device-side fill -- result[v0, 0] = value for every batch row of the
# int32 (batch_size, 1) output buffer.
@T.prim_func def full(var_result: T.handle, value: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch_size = T.int32(is_size_var=True) result = T.match_buffer(var_result, (batch_size, 1), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((batch_size + 1023) // 1024, thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("block"): v0 = T.axis.spatial(batch_size, ax0_fused_0 * 1024 + ax0_fused_1) T.where(ax0_fused_0 * 1024 + ax0_fused_1 < batch_size) T.reads() T.writes(result[v0, 0]) result[v0, 0] = value
# ---------------------------------------------------------------------------
# fused_NT_matmul1_add8_gelu2: single-token GEMV
#   out[0,0,j] = GELU(dot(layer_norm358, fc1_weight[j,:]) + fc1_bias[j])
# for j in [0, 5120), fp16, using the erf-based GELU
#   x * (0.5 + 0.5 * erf(x / sqrt(2))).
# Scheduled as a two-stage rfactor reduction (256 per-lane partials ->
# 64 threadIdx.x partials -> final cross-thread sum) on 4x64 thread blocks;
# the activation is staged through shared memory, weights through registers.
@T.prim_func def fused_NT_matmul1_add8_gelu2(layer_norm358: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_weight5: T.Buffer((T.int64(5120), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_bias5: T.Buffer((T.int64(5120),), "float16"), 
T_multiply_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") model_decoder_layers_0_fc1_weight5_local = T.alloc_buffer((T.int64(5120), T.int64(1280)), "float16", scope="local") layer_norm358_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(1280), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("layer_norm358_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3) T.reads(layer_norm358[v0, v1, v2]) T.writes(layer_norm358_shared[v0, v1, v2]) layer_norm358_shared[v0, v1, v2] = layer_norm358[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * 
T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(2)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_fc1_weight5_local"): v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_fc1_weight5[v0, v1]) T.writes(model_decoder_layers_0_fc1_weight5_local[v0, v1]) model_decoder_layers_0_fc1_weight5_local[v0, v1] = model_decoder_layers_0_fc1_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], 
layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] * model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, 
vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for 
ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_multiply_2"): v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc1_bias5[v0]) T.writes(T_multiply_intermediate[T.int64(0), T.int64(0), v0]) T_multiply_intermediate[T.int64(0), T.int64(0), v0] = (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * T.float16(0.70710678118654757)))) * T.float16(0.5))
# ---------------------------------------------------------------------------
# fused_NT_matmul2_add7_add6: single-token GEMV
#   out[0,0,j] = add1227[0,0,j] + dot(gelu130, fc2_weight[j,:]) + fc2_bias[j]
# i.e. the fc2 projection plus bias plus residual add, fp16, j in [0, 1280).
# Same two-stage rfactor schedule as the fc1 kernel, on 16x32 thread blocks
# (128 -> 32 partials), activations staged through shared memory.
@T.prim_func def fused_NT_matmul2_add7_add6(gelu130: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_weight5: T.Buffer((T.int64(1280), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_bias5: T.Buffer((T.int64(1280),), "float16"), add1227: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_fc2_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(5120)), "float16", scope="local") gelu130_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), 
thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(2)): with T.block("gelu130_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(5120), ax2_0 * T.int64(1024) + ax2_1 * T.int64(64) + ax2_2 * T.int64(2) + ax2_3) T.reads(gelu130[v0, v1, v2]) T.writes(gelu130_shared[v0, v1, v2]) gelu130_shared[v0, v1, v2] = gelu130[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(20), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_fc2_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = 
T.axis.spatial(T.int64(5120), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_fc2_weight5[v0, v1]) T.writes(model_decoder_layers_0_fc2_weight5_local[v0, v1]) model_decoder_layers_0_fc2_weight5_local[v0, v1] = model_decoder_layers_0_fc2_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], gelu130_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + 
gelu130_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) 
T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add_1"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(add1227[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc2_bias5[v0]) T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) T_add_intermediate_1[T.int64(0), T.int64(0), v0] = add1227[T.int64(0), 
T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc2_bias5[v0])
# ---------------------------------------------------------------------------
# fused_NT_matmul_add7: single-token q_proj GEMV + bias
#   out[0,0,j] = dot(layer_norm356, q_proj_weight[j,:]) + q_proj_bias[j]
# (definition continues past this excerpt).
@T.prim_func def fused_NT_matmul_add7(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_bias5: T.Buffer((T.int64(1280),), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("layer_norm356_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) T.where((ax2_0 * 
T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) T.reads(layer_norm356[v0, v1, v2]) T.writes(layer_norm356_shared[v0, v1, v2]) layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]) T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1]) model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with 
T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for 
ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for 
ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_q_proj_bias5[v0]) T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) T_add_intermediate[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_q_proj_bias5[v0] @T.prim_func def fused_NT_matmul_add7_add6(reshape1361: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_out_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_out_proj_bias5: T.Buffer((T.int64(1280),), "float16"), add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): 
T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_self_attn_out_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") reshape1361_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("reshape1361_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) T.reads(reshape1361[v0, v1, v2]) T.writes(reshape1361_shared[v0, v1, v2]) reshape1361_shared[v0, v1, v2] = reshape1361[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * 
T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_self_attn_out_proj_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1]) T.writes(model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1]) model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) 
T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), 
u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) 
T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add_1"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(add1220[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_out_proj_bias5[v0]) T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) T_add_intermediate_1[T.int64(0), T.int64(0), v0] = add1220[T.int64(0), T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_out_proj_bias5[v0]) @T.prim_func def fused_add4_maximum_minimum(p_add4: T.handle, p_lv611: T.handle, p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add4 = T.match_buffer(p_add4, (batch_size, T.int64(1500), T.int64(1280)), "float16") lv611 = T.match_buffer(p_lv611, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_minimum_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_minimum"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // 
T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(add4[v0, v1, v2], lv611[v0, v1, v2]) T.writes(T_minimum_intermediate[v0, v1, v2]) T_minimum_intermediate[v0, v1, v2] = T.min(T.max(add4[v0, v1, v2] + lv611[v0, v1, v2], T.float16(-65504)), T.float16(65504)) @T.prim_func def fused_conv1d1_add2_gelu1(p_gelu: T.handle, model_encoder_conv2_weight: T.Buffer((T.int64(1280), T.int64(1280), T.int64(3)), "float16"), lv3: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() gelu = T.match_buffer(p_gelu, (batch_size, T.int64(1280), T.int64(3000)), "float16") T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(1500)), "float16") # with T.block("root"): conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(1500)), "float16", scope="shared") for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(1920000), thread="blockIdx.x"): for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax3_ax4_fused_0 in T.serial(T.int64(15), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("conv1d_ncw"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000) + ax0) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500) + ax1) v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500) + ax2) v3 = T.axis.reduce(T.int64(1280), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) T.reads(gelu[v0, v3, 
v2 * T.int64(2) + v4 - T.int64(1)], model_encoder_conv2_weight[v1, v3, v4]) T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) with T.init(): conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 * T.int64(2) + v4 and v2 * T.int64(2) + v4 < T.int64(3001), gelu[v0, v3, v2 * T.int64(2) + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv2_weight[v1, v3, v4] for ax3 in range(T.int64(1)): for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_multiply_2"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500)) v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500)) v3 = T.axis.spatial(T.int64(1), ax3) v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv3[T.int64(0), v1, T.int64(0)]) T.writes(T_multiply_intermediate[v0, v1, v2]) T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) @T.prim_func def fused_conv1d_add1_gelu(p_input_features: T.handle, model_encoder_conv1_weight: T.Buffer((T.int64(1280), T.int64(128), T.int64(3)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() input_features = T.match_buffer(p_input_features, (batch_size, T.int64(128), T.int64(3000)), "float16") 
T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(3000)), "float16") # with T.block("root"): conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(3000)), "float16", scope="shared") for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(3840000), thread="blockIdx.x"): for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax3_ax4_fused_0 in T.serial(T.int64(2), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("conv1d_ncw"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000) + ax0) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(3840000) // T.int64(3000) + ax1) v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000) + ax2) v3 = T.axis.reduce(T.int64(128), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) T.where(ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1 < T.int64(384)) T.reads(input_features[v0, v3, v2 + v4 - T.int64(1)], model_encoder_conv1_weight[v1, v3, v4]) T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) with T.init(): conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 + v4 and v2 + v4 < T.int64(3001), input_features[v0, v3, v2 + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv1_weight[v1, v3, v4] for ax3 in range(T.int64(1)): for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_multiply_2"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000)) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % 
T.int64(3840000) // T.int64(3000)) v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000)) v3 = T.axis.spatial(T.int64(1), ax3) v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv1[T.int64(0), v1, T.int64(0)]) T.writes(T_multiply_intermediate[v0, v1, v2]) T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) @T.prim_func def fused_reshape20_reshape20_add6(take7: T.Buffer((T.int64(1), T.int64(1280)), "float16"), take8: T.Buffer((T.int64(1), T.int64(1280)), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_add"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(take7[T.int64(0), v0], take8[T.int64(0), v0]) T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) T_add_intermediate[T.int64(0), T.int64(0), v0] = take7[T.int64(0), v0] + take8[T.int64(0), v0] @T.prim_func def fused_reshape21_reshape21_reshape21_concatenate2_reshape22(add1221: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), add1222: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1_2_3: T.Buffer((T.int64(1), T.int64(60), T.int64(64)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # 
with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding(T.int64(4), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_3"): v0 = T.axis.spatial(T.int64(60), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(3840)) T.reads(add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) T.writes(T_reshape_intermediate_1_2_3[T.int64(0), v0, v1]) T_reshape_intermediate_1_2_3[T.int64(0), v0, v1] = T.if_then_else(T.int64(40) <= v0, add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], T.if_then_else(T.int64(20) <= v0, lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1])) @T.prim_func def fused_reshape21_reshape25(add1225: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_1"): v0 = T.axis.spatial(T.int64(20), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(1280)) T.reads(add1225[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) T.writes(T_reshape_intermediate_1[T.int64(0), v0, v1]) T_reshape_intermediate_1[T.int64(0), v0, v1] = add1225[T.int64(0), 
T.int64(0), v0 * T.int64(64) + v1] @T.prim_func def fused_reshape23_reshape24(lv265: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_1"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)]) T.writes(T_reshape_intermediate_1[T.int64(0), T.int64(0), v0]) T_reshape_intermediate_1[T.int64(0), T.int64(0), v0] = lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)] @T.prim_func def fused_reshape9(packed_params_1: T.Buffer((T.int64(1280),), "float16"), T_reshape_intermediate: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(packed_params_1[v0]) T.writes(T_reshape_intermediate[T.int64(0), v0, T.int64(0)]) T_reshape_intermediate[T.int64(0), v0, T.int64(0)] = packed_params_1[v0] @T.prim_func def fused_rope(var_qkv: T.handle, var_position_map: T.handle, var_q: T.handle, var_k: T.handle, var_v: T.handle, apply_rope: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], 
"max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() qkv = T.match_buffer(var_qkv, (seq_len, 60, 64), "float16") position_map = T.match_buffer(var_position_map, (seq_len,), "int32", offset_factor=1) q = T.match_buffer(var_q, (seq_len, 20, 64), "float16") k = T.match_buffer(var_k, (seq_len, 20, 64), "float16") v = T.match_buffer(var_v, (seq_len, 20, 64), "float16") # with T.block("root"): for iters_0_iters_1_iters_2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for iters_0_iters_1_iters_2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("llama_fused_rope"): s = T.axis.spatial(seq_len, (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) // T.int64(3840)) h = T.axis.spatial(60, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(3840) // T.int64(64))) d = T.axis.spatial(64, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(64))) T.where(iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1 < seq_len * T.int64(3840)) T.reads(position_map[s], qkv[s, h, d - 32:d - 32 + 65]) T.writes(q[s, h, d], k[s, h - 20, d], v[s, h - 40, d]) if h < 20: q[s, h, d] = T.if_then_else(apply_rope > 0 and d < 64, T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) else: if h < 40: k[s, h - 20, d] = T.if_then_else(apply_rope > 0 and d < 64, 
T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) else: v[s, h - 40, d] = qkv[s, h, d] @T.prim_func def fused_transpose_add3(packed_params_4: T.Buffer((T.int64(1500), T.int64(1280)), "float16"), p_gelu1: T.handle, p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() gelu1 = T.match_buffer(p_gelu1, (batch_size, T.int64(1280), T.int64(1500)), "float16") T_add_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_add"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(gelu1[v0, v2, v1], packed_params_4[v1, v2]) T.writes(T_add_intermediate[v0, v1, v2]) T_add_intermediate[v0, v1, v2] = gelu1[v0, v2, v1] + packed_params_4[v1, v2] @T.prim_func def gather_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) m, n = T.int32(is_size_var=True), T.int32(is_size_var=True) src = T.match_buffer(var_src, (m, n)) batch_size = T.int32(is_size_var=True) indices = T.match_buffer(var_indices, (batch_size,), "int32") dst = T.match_buffer(var_dst, (batch_size, n)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("gather_2d"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) T.reads(src[indices[v0], v1], indices[v0]) T.writes(dst[v0, v1]) dst[v0, v1] = src[indices[v0], v1] @T.prim_func def get_index_from_sorted(A: T.handle, B: T.handle, C: T.handle, D: T.handle, E: T.handle, F: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch, vocab_size = T.int64(), T.int64() cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) indices = T.match_buffer(B, (batch, vocab_size), "int32") renorm_prob = T.match_buffer(C, (batch, 1)) out_batch = T.int64() usample = T.match_buffer(D, (out_batch, 1)) sample_indices = T.match_buffer(E, (out_batch, 1), "int32") output_index = T.match_buffer(F, (out_batch, 1), "int32") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((out_batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_get_index_from_sorted"): v0 = T.axis.spatial(out_batch, (ax0_ax1_fused_0 * 
T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * out_batch) // vocab_size) v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < out_batch * vocab_size) T.reads(usample[v0, T.int64(0)], cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1):v1 - T.int64(1) + T.int64(2)], sample_indices[v0, T.int64(0)], renorm_prob[sample_indices[v0, T.int64(0)], 0], indices[sample_indices[v0, T.int64(0)], T.min(T.int64(0), v1):T.min(T.int64(0), v1) + (v1 + T.int64(1))]) T.writes(output_index[v0, 0]) if usample[v0, T.int64(0)] < cumsum_sorted[sample_indices[v0, T.int64(0)], v1] / renorm_prob[sample_indices[v0, T.int64(0)], 0] or v1 + T.int64(1) == vocab_size: if v1 == T.int64(0): output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], 0] else: if usample[v0, T.int64(0)] >= cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1)] / renorm_prob[sample_indices[v0, T.int64(0)], 0]: output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], v1] @T.prim_func def get_renorm_prob(A: T.handle, B: T.handle, C: T.handle, D: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch, vocab_size = T.int64(), T.int64() cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) top_p = T.match_buffer(B, (batch, 1)) top_k = T.match_buffer(C, (batch, 1), "int32") renorm_prob = T.match_buffer(D, (batch, 1)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_get_renorm_prob"): v0 = T.axis.spatial(batch, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * batch) // 
vocab_size) v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch * vocab_size) T.reads(cumsum_sorted[v0, T.min(T.min(T.int64(0), v1), v1 + T.int64(1)):T.min(T.min(T.int64(0), v1), v1 + T.int64(1)) + (v1 + T.int64(2))], top_p[v0, 0], top_k[v0, 0]) T.writes(renorm_prob[v0, 0]) if not (cumsum_sorted[v0, 0] < top_p[v0, 0] and top_k[v0, 0] > 1): renorm_prob[v0, 0] = cumsum_sorted[v0, 0] else: if cumsum_sorted[v0, v1] < top_p[v0, 0] and v1 + T.int64(1) < T.Cast("int64", top_k[v0, 0]): if v1 + T.int64(1) == vocab_size: renorm_prob[v0, 0] = cumsum_sorted[v0, v1] else: if not (cumsum_sorted[v0, v1 + T.int64(1)] < top_p[v0, 0] and v1 + T.int64(1) + T.int64(1) < T.Cast("int64", top_k[v0, 0])): renorm_prob[v0, 0] = cumsum_sorted[v0, v1 + T.int64(1)]
    # Slices the last sequence position: index[0, 0, :] = layer_norm355[0, seq_len - 1, :] (hidden width 1280).
    @T.prim_func def index(var_layer_norm355: T.handle, index: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() layer_norm355 = T.match_buffer(var_layer_norm355, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("index"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(layer_norm355[T.int64(0), seq_len - T.int64(1), v0]) T.writes(index[T.int64(0), T.int64(0), v0]) index[T.int64(0), T.int64(0), v0] = layer_norm355[T.int64(0), seq_len - T.int64(1), v0]
    # LayerNorm over the 1280-wide hidden dim of a (batch, 1, 1280) input; mean/variance via shared-memory
    # reduction (0.00078125 == 1/1280, eps 1e-5), then scale/shift with the weight/bias buffers.
    @T.prim_func def layer_norm(var_add578: T.handle,
model_decoder_layers_0_self_attn_layer_norm_weight3: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias3: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add578 = T.match_buffer(var_add578, (batch_size, T.int64(1), T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): add578_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") add578_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") for ax0_fused in T.thread_binding(batch_size, thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add578_red_temp"): v0 = T.axis.spatial(batch_size, ax0_fused + ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add578[v0, T.int64(0), v1]) T.writes(add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)]) with T.init(): add578_red_temp_v0_shared[v0, T.int64(0)] = T.float32(0) add578_red_temp_v1_shared[v0, T.int64(0)] = T.float32(0) v_add578_red_temp_v0: T.float32 = add578_red_temp_v0_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) v_add578_red_temp_v1: T.float32 = add578_red_temp_v1_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) * T.Cast("float32", add578[v0, T.int64(0), v1]) add578_red_temp_v0_shared[v0, T.int64(0)] = v_add578_red_temp_v0 add578_red_temp_v1_shared[v0, T.int64(0)] = v_add578_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
with T.block("T_layer_norm"): v0 = T.axis.spatial(batch_size, ax0_fused) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add578[v0, T.int64(0), v1], add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight3[v1], model_decoder_layers_0_self_attn_layer_norm_bias3[v1]) T.writes(T_layer_norm[v0, T.int64(0), v1]) T_layer_norm[v0, T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add578[v0, T.int64(0), v1]) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add578_red_temp_v1_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) * (add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight3[v1] + model_decoder_layers_0_self_attn_layer_norm_bias3[v1]
    # Same LayerNorm pattern over (batch, 1500, 1280): one block per (batch, position), 256-thread reduction
    # over the 1280 hidden dim (0.00078125 == 1/1280, eps 1e-5). Generated TIR — comments only.
    @T.prim_func def layer_norm1(var_add: T.handle, model_encoder_layers_0_self_attn_layer_norm_weight: T.Buffer((T.int64(1280),), "float16"), model_encoder_layers_0_self_attn_layer_norm_bias: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): add_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") add_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") for ax0_ax1_fused in T.thread_binding(batch_size * T.int64(1500), thread="blockIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(5),
annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add_red_temp"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500) + ax0) v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500) + ax1) v2 = T.axis.reduce(T.int64(1280), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(add[v0, v1, v2]) T.writes(add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1]) with T.init(): add_red_temp_v0_shared[v0, v1] = T.float32(0) add_red_temp_v1_shared[v0, v1] = T.float32(0) v_add_red_temp_v0: T.float32 = add_red_temp_v0_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) v_add_red_temp_v1: T.float32 = add_red_temp_v1_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) * T.Cast("float32", add[v0, v1, v2]) add_red_temp_v0_shared[v0, v1] = v_add_red_temp_v0 add_red_temp_v1_shared[v0, v1] = v_add_red_temp_v1 for ax2_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500)) v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500)) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1) T.reads(add[v0, v1, v2], add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1], model_encoder_layers_0_self_attn_layer_norm_weight[v2], model_encoder_layers_0_self_attn_layer_norm_bias[v2]) T.writes(T_layer_norm[v0, v1, v2]) T_layer_norm[v0, v1, v2] = T.Cast("float16", (T.Cast("float32", add[v0, v1, v2]) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) * T.rsqrt(add_red_temp_v1_shared[v0, v1] * T.float32(0.00078125000000000004) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004) * (add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_encoder_layers_0_self_attn_layer_norm_weight[v2] +
model_encoder_layers_0_self_attn_layer_norm_bias[v2]
    # LayerNorm variant over (1, seq_len, 1280): one block per sequence position, same mean/variance
    # shared-memory reduction (0.00078125 == 1/1280, eps 1e-5). Generated TIR — comments only.
    @T.prim_func def layer_norm2(var_add257: T.handle, model_decoder_layers_0_self_attn_layer_norm_weight2: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias2: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() add257 = T.match_buffer(var_add257, (T.int64(1), seq_len, T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): add257_red_temp_v0_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") add257_red_temp_v1_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") for ax0_fused in T.thread_binding(seq_len, thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add257_red_temp"): v0 = T.axis.spatial(seq_len, ax0_fused + ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add257[T.int64(0), v0, v1]) T.writes(add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0]) with T.init(): add257_red_temp_v0_shared[T.int64(0), v0] = T.float32(0) add257_red_temp_v1_shared[T.int64(0), v0] = T.float32(0) v_add257_red_temp_v0: T.float32 = add257_red_temp_v0_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) v_add257_red_temp_v1: T.float32 = add257_red_temp_v1_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) * T.Cast("float32", add257[T.int64(0), v0, v1]) add257_red_temp_v0_shared[T.int64(0), v0] = v_add257_red_temp_v0 add257_red_temp_v1_shared[T.int64(0), v0] = v_add257_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in
T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(seq_len, ax0_fused) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add257[T.int64(0), v0, v1], add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0], model_decoder_layers_0_self_attn_layer_norm_weight2[v1], model_decoder_layers_0_self_attn_layer_norm_bias2[v1]) T.writes(T_layer_norm[T.int64(0), v0, v1]) T_layer_norm[T.int64(0), v0, v1] = T.Cast("float16", (T.Cast("float32", add257[T.int64(0), v0, v1]) - add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) * T.rsqrt(add257_red_temp_v1_shared[T.int64(0), v0] * T.float32(0.00078125000000000004) - add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004) * (add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight2[v1] + model_decoder_layers_0_self_attn_layer_norm_bias2[v1]
    # LayerNorm variant for a single-token (1, 1, 1280) input: one block, 256-thread reduction over the hidden dim.
    @T.prim_func def layer_norm3(add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_layer_norm_weight5: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias5: T.Buffer((T.int64(1280),), "float16"), T_layer_norm: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): add1220_red_temp_v0_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") add1220_red_temp_v1_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") for ax0_fused in T.thread_binding(T.int64(1), thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256,
"pragma_unroll_explicit": 1}): with T.block("add1220_red_temp"): v0 = T.axis.spatial(T.int64(1), ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add1220[T.int64(0), T.int64(0), v1]) T.writes(add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)]) with T.init(): add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = T.float32(0) add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = T.float32(0) v_add1220_red_temp_v0: T.float32 = add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) v_add1220_red_temp_v1: T.float32 = add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) * T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v0 add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(T.int64(1), T.int64(0)) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add1220[T.int64(0), T.int64(0), v1], add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight5[v1], model_decoder_layers_0_self_attn_layer_norm_bias5[v1]) T.writes(T_layer_norm[T.int64(0), T.int64(0), v1]) T_layer_norm[T.int64(0), T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * 
T.float32(0.00078125000000000004) * (add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight5[v1] + model_decoder_layers_0_self_attn_layer_norm_bias5[v1] @T.prim_func def merge_state_inplace(v: T.handle, s: T.handle, v_other: T.handle, s_other: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) N, H, D = T.int32(is_size_var=True), T.int32(is_size_var=True), T.int32(is_size_var=True) V = T.match_buffer(v, (N, H, D), "float16") S = T.match_buffer(s, (N, H)) V_other = T.match_buffer(v_other, (N, H, D), "float16") S_other = T.match_buffer(s_other, (N, H)) # with T.block("root"): for bx in T.thread_binding(N, thread="blockIdx.x"): for by in T.thread_binding(1, thread="blockIdx.y"): for ty in T.thread_binding(20, thread="threadIdx.y"): for tx in T.thread_binding(16, thread="threadIdx.x"): with T.block("merge"): T.reads(S[bx, ty + by * 20], S_other[bx, ty + by * 20], V[bx, ty + by * 20, tx * 4:tx * 4 + 4], V_other[bx, ty + by * 20, tx * 4:tx * 4 + 4]) T.writes(V[bx, ty + by * 20, tx * 4:tx * 4 + 4], S[bx, ty + by * 20]) s_val = T.alloc_buffer((1,), scope="local") s_other_val = T.alloc_buffer((1,), scope="local") s_max = T.alloc_buffer((1,), scope="local") scale = T.alloc_buffer((1,), scope="local") other_scale = T.alloc_buffer((1,), scope="local") v_vec = T.alloc_buffer((4,), "float16", scope="local") v_other_vec = T.alloc_buffer((4,), "float16", scope="local") s_val[0] = S[bx, ty + by * 20] s_other_val[0] = S_other[bx, ty + by * 20] s_max[0] = T.max(s_val[0], s_other_val[0]) s_val[0] = T.exp2(s_val[0] - s_max[0]) 
s_other_val[0] = T.exp2(s_other_val[0] - s_max[0]) scale[0] = s_val[0] / (s_val[0] + s_other_val[0]) other_scale[0] = s_other_val[0] / (s_val[0] + s_other_val[0]) for vec in T.vectorized(4): v_vec[vec] = V[bx, ty + by * 20, tx * 4 + vec] for vec in T.vectorized(4): v_other_vec[vec] = V_other[bx, ty + by * 20, tx * 4 + vec] for vec in range(4): v_vec[vec] = T.Cast("float16", T.Cast("float32", v_vec[vec]) * scale[0] + T.Cast("float32", v_other_vec[vec]) * other_scale[0]) for vec in T.vectorized(4): V[bx, ty + by * 20, tx * 4 + vec] = v_vec[vec] S[bx, ty + by * 20] = T.log2(s_val[0] + s_other_val[0]) + s_max[0]
    # One-block-per-sample multinomial sampling: scans each probability row in 512-wide chunks (4x32 threads x 4
    # elements), builds a shared-memory inclusive cumsum per chunk, and emits the first index whose running total
    # reaches the uniform sample u (with a 1e-6 tolerance). Generated TIR — comments only.
    @T.prim_func def parallel_sampling_from_prob(var_prob: T.handle, var_uniform_samples: T.handle, var_row_indices: T.handle, var_sampled_token_ids: T.handle): T.func_attr({"tir.is_scheduled": 1}) n, vocab_size = T.int64(), T.int64() prob = T.match_buffer(var_prob, (n, vocab_size)) batch_size = T.int64() uniform_samples = T.match_buffer(var_uniform_samples, (batch_size, 1)) row_indices = T.match_buffer(var_row_indices, (batch_size, 1), "int32") token_ids = T.match_buffer(var_sampled_token_ids, (batch_size, 1), "int32") # with T.block("root"): aggregate = T.alloc_buffer((), scope="local") sample_id_local = T.alloc_buffer((), "int32", scope="local") step_iter = T.alloc_buffer((), "int32", scope="local") for bx in T.thread_binding(batch_size, thread="blockIdx.x"): row_idx: T.int32 = row_indices[bx, 0] for ty in T.thread_binding(T.int64(4), thread="threadIdx.y"): for tx in T.thread_binding(T.int64(32), thread="threadIdx.x"): u: T.float32 = uniform_samples[bx, 0] aggregate[()] = T.Cast("float32", 0) step_iter[()] = 0 while T.tvm_thread_invariant((step_iter[()] == 0 or aggregate[()] < u - T.float32(9.9999999999999995e-07)) and T.Cast("int64", step_iter[()]) < (vocab_size + T.int64(512) - T.int64(1)) // T.int64(512)): with T.block(""): T.reads(step_iter[()], prob[row_idx, T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4):T.Cast("int64",
step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + T.int64(4)], aggregate[()]) T.writes(sample_id_local[()], aggregate[()]) prob_gt_threshold = T.alloc_buffer((T.int64(4),), scope="local") cumsum = T.alloc_buffer((T.int64(512),), scope="shared") greater_than_u = T.alloc_buffer((T.int64(4),), "bool", scope="local") mask = T.alloc_buffer((T.int64(4),), "bool", scope="local") valid = T.alloc_buffer((T.int64(4),), "bool", scope="local") indices = T.alloc_buffer((T.int64(4),), "int32", scope="local") step_aggregate = T.alloc_buffer((), scope="local") for v in T.unroll(T.int64(4)): idx: T.int64 = T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v prob_local: T.float32 = T.if_then_else(idx < vocab_size, prob[row_idx, idx], T.Cast("float32", 0)) prob_gt_threshold[v] = T.if_then_else(prob_local > T.float32(0), prob_local, T.Cast("float32", 0)) valid[v] = prob_local > T.float32(0) and idx < vocab_size with T.block(""): T.reads(prob_gt_threshold[T.int64(0):T.int64(4)]) T.writes(step_aggregate[()]) local_sum = T.alloc_buffer((), scope="local") shared_buf = T.alloc_buffer((T.int64(128),), scope="shared") idx: T.int64 = ty * T.int64(32) + tx local_sum[()] = T.Cast("float32", 0) for i in T.unroll(T.int64(4)): local_sum[()] = local_sum[()] + prob_gt_threshold[i] shared_buf[idx] = local_sum[()] for i in T.unroll(T.int64(7)): if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): shared_buf[idx] = shared_buf[idx] + shared_buf[idx + T.shift_left(T.int64(1), i)] step_aggregate[()] = shared_buf[0] if T.tvm_thread_invariant(aggregate[()] + step_aggregate[()] >= u - T.float32(9.9999999999999995e-07)): for i in T.unroll(T.int64(1), T.int64(4)): prob_gt_threshold[i] = prob_gt_threshold[i] + prob_gt_threshold[i - T.int64(1)] for i in T.vectorized(T.int64(4)): cumsum[ty * T.int64(128) + tx * T.int64(4) + i] = prob_gt_threshold[i] for i in T.unroll(T.int64(5)): for j in T.vectorized(T.int64(4)): idx: T.int64 = ty * T.int64(128)
+ tx * T.int64(4) if tx >= T.shift_left(T.int64(1), i): cumsum[idx + j] = cumsum[idx + j] + cumsum[idx - T.shift_left(T.int64(1), i) * T.int64(4) + T.int64(4) - T.int64(1)] for i in T.unroll(T.int64(1), T.int64(4)): for j in T.vectorized(T.int64(4)): if ty == T.int64(0): idx: T.int64 = i * T.int64(128) + tx * T.int64(4) cumsum[idx + j] = cumsum[idx + j] + cumsum[i * T.int64(128) - T.int64(1)] for v in T.unroll(T.int64(4)): greater_than_u[v] = cumsum[ty * T.int64(128) + tx * T.int64(4) + v] + aggregate[()] >= u - T.float32(9.9999999999999995e-07) with T.block(""): T.reads(greater_than_u[T.int64(0):T.int64(4)]) T.writes(mask[T.int64(0):T.int64(4)]) shared_buf = T.alloc_buffer((T.int64(128),), "bool", scope="shared") tx_idx: T.int64 = ty * T.int64(32) + tx shared_buf[tx_idx] = greater_than_u[T.int64(3)] mask[0] = T.if_then_else(tx_idx != T.int64(0), T.Cast("int8", greater_than_u[0]) != T.Cast("int8", shared_buf[tx_idx - T.int64(1)]), greater_than_u[0]) for i in T.unroll(T.int64(1), T.int64(4)): mask[i] = T.Cast("int8", greater_than_u[i]) != T.Cast("int8", greater_than_u[i - T.int64(1)]) for v in T.unroll(T.int64(4)): mask[v] = mask[v] and valid[v] indices[v] = T.Cast("int32", T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v) with T.block(""): T.reads(mask[T.int64(0):T.int64(4)], indices[T.int64(0):T.int64(4)]) T.writes(sample_id_local[()]) local_sum = T.alloc_buffer((), "int32", scope="local") shared_buf = T.alloc_buffer((T.int64(128),), "int32", scope="shared") idx: T.int64 = ty * T.int64(32) + tx local_sum[()] = T.Cast("int32", vocab_size - T.int64(1)) for i in T.unroll(T.int64(4)): if mask[i]: local_sum[()] = T.min(local_sum[()], indices[i]) shared_buf[idx] = local_sum[()] for i in T.unroll(T.int64(7)): if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): shared_buf[idx] = T.min(shared_buf[idx], shared_buf[idx + T.shift_left(T.int64(1), i)]) sample_id_local[()] = shared_buf[0] aggregate[()] = aggregate[()] +
step_aggregate[()] step_iter[()] = step_iter[()] + 1 if tx == T.int64(0) and ty == T.int64(0): token_ids[bx, 0] = sample_id_local[()]
    # Reshape (b, 1500, 1280) -> (b, 1500, 20, 64): split the hidden dim into 20 heads x 64 dims.
    @T.prim_func def reshape(var_lv: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv = T.match_buffer(var_lv, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) T.reads(lv[v0, v1, v2 * T.int64(64) + v3]) T.writes(T_reshape[v0, v1, v2, v3]) T_reshape[v0, v1, v2, v3] = lv[v0, v1, v2 * T.int64(64) + v3]
    # Reshape (b, 1500, 20, 64) -> (b*1500, 20, 64): fold batch and sequence into one axis.
    @T.prim_func def reshape1(var_reshape256: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape256 = T.match_buffer(var_reshape256, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 =
T.axis.spatial(batch_size * T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.reads(reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2]
    # Reshape (b*1500, 20, 64) -> (b, 1500, 20, 64): inverse of reshape1.
    @T.prim_func def reshape10(var_lv4: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv4 = T.match_buffer(var_lv4, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) T.reads(lv4[v0 * T.int64(1500) + v1, v2, v3]) T.writes(T_reshape[v0, v1, v2, v3]) T_reshape[v0, v1, v2, v3] = lv4[v0 * T.int64(1500) + v1, v2, v3]
    # Reshape (b, 1500, 20, 64) -> (b, 1500, 1280): merge heads back into the hidden dim.
    @T.prim_func def reshape11(var_reshape6: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape6 =
T.match_buffer(var_reshape6, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)]
    # Reshape (1, seq_len) int32 -> (seq_len,): drop the leading unit batch dim of the token-id tensor.
    @T.prim_func def reshape12(var_input_ids: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() input_ids = T.match_buffer(var_input_ids, (T.int64(1), seq_len), "int32") T_reshape = T.match_buffer(var_T_reshape, (seq_len,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((seq_len + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < seq_len) T.reads(input_ids[T.int64(0), v0]) T.writes(T_reshape[v0]) T_reshape[v0] = input_ids[T.int64(0), v0]
    # Reshape (seq_len, 1280) -> (1, seq_len, 1280): add a leading unit batch dim.
    @T.prim_func def reshape13(var_take: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() take = T.match_buffer(var_take, (seq_len, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1),
seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) T.reads(take[v0, v1]) T.writes(T_reshape[T.int64(0), v0, v1]) T_reshape[T.int64(0), v0, v1] = take[v0, v1]
    # Reshape (1, seq_len, 1280) -> (1, seq_len, 20, 64): split hidden dim into heads.
    @T.prim_func def reshape14(var_lv416: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() lv416 = T.match_buffer(var_lv416, (T.int64(1), seq_len, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(lv416[T.int64(0), v0, v1 * T.int64(64) + v2]) T.writes(T_reshape[T.int64(0), v0, v1, v2]) T_reshape[T.int64(0), v0, v1, v2] = lv416[T.int64(0), v0, v1 * T.int64(64) + v2]
    # Reshape (1, seq_len, 60, 64) -> (seq_len, 60, 64): drop the unit batch dim of the fused QKV tensor.
    @T.prim_func def reshape15(var_concat: T.handle, var_T_reshape: T.handle):
T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() concat = T.match_buffer(var_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) T.reads(concat[T.int64(0), v0, v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = concat[T.int64(0), v0, v1, v2] @T.prim_func def reshape16(var_lv69: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() lv69 = T.match_buffer(var_lv69, (seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + 
ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(lv69[v0, v1, v2]) T.writes(T_reshape[T.int64(0), v0, v1, v2]) T_reshape[T.int64(0), v0, v1, v2] = lv69[v0, v1, v2] @T.prim_func def reshape17(var_reshape391: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape391 = T.match_buffer(var_reshape391, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) T.reads(reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)]) T.writes(T_reshape[T.int64(0), v0, v1]) T_reshape[T.int64(0), v0, v1] = reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)] @T.prim_func def reshape18(var_reshape393: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape393 = T.match_buffer(var_reshape393, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = 
T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(reshape393[T.int64(0), v0, v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape393[T.int64(0), v0, v1, v2] @T.prim_func def reshape19(input_ids: T.Buffer((T.int64(1), T.int64(1)), "int32"), T_reshape: T.Buffer((T.int64(1),), "int32")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(1), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(T.int64(1), T.int64(0)) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1)) T.reads(input_ids[T.int64(0), T.int64(0)]) T.writes(T_reshape[T.int64(0)]) T_reshape[T.int64(0)] = input_ids[T.int64(0), T.int64(0)] @T.prim_func def reshape2(var_input_ids: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() input_ids = T.match_buffer(var_input_ids, (batch_size, T.int64(1)), "int32") T_reshape = T.match_buffer(var_T_reshape, (batch_size,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((batch_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < batch_size) T.reads(input_ids[v0, T.int64(0)]) T.writes(T_reshape[v0]) T_reshape[v0] = input_ids[v0, T.int64(0)] @T.prim_func def 
reshape3(var_take3: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() take3 = T.match_buffer(var_take3, (batch_size, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(take3[v0, v1]) T.writes(T_reshape[v0, T.int64(0), v1]) T_reshape[v0, T.int64(0), v1] = take3[v0, v1] @T.prim_func def reshape4(var_lv224: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv224 = T.match_buffer(var_lv224, (batch_size, T.int64(1), T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < 
batch_size * T.int64(1280)) T.reads(lv224[v0, T.int64(0), v1 * T.int64(64) + v2]) T.writes(T_reshape[v0, T.int64(0), v1, v2]) T_reshape[v0, T.int64(0), v1, v2] = lv224[v0, T.int64(0), v1 * T.int64(64) + v2] @T.prim_func def reshape5(var_concat32: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() concat32 = T.match_buffer(var_concat32, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) T.reads(concat32[v0, T.int64(0), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = concat32[v0, T.int64(0), v1, v2] @T.prim_func def reshape6(var_lv134: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv134 = T.match_buffer(var_lv134, (batch_size, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with 
T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) T.reads(lv134[v0, v1, v2]) T.writes(T_reshape[v0, T.int64(0), v1, v2]) T_reshape[v0, T.int64(0), v1, v2] = lv134[v0, v1, v2] @T.prim_func def reshape7(var_reshape714: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape714 = T.match_buffer(var_reshape714, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)]) T.writes(T_reshape[v0, T.int64(0), v1]) T_reshape[v0, T.int64(0), v1] = reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)] @T.prim_func def reshape8(var_reshape716: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape716 = T.match_buffer(var_reshape716, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_reshape = 
T.match_buffer(var_T_reshape, (batch_size, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) T.reads(reshape716[v0, T.int64(0), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape716[v0, T.int64(0), v1, v2] @T.prim_func def sampler_take_probs_tir(var_unsorted_probs: T.handle, var_sorted_indices: T.handle, var_sample_indices: T.handle, var_sampling_results: T.handle, var_top_prob_offsets: T.handle, var_sampled_values: T.handle, var_top_prob_probs: T.handle, var_top_prob_indices: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) unsorted_probs = T.match_buffer(var_unsorted_probs, (batch_size, vocab_size)) sorted_indices = T.match_buffer(var_sorted_indices, (batch_size, vocab_size), "int32") num_samples = T.int32(is_size_var=True) sample_indices = T.match_buffer(var_sample_indices, (num_samples,), "int32") sampling_results = T.match_buffer(var_sampling_results, 
(num_samples,), "int32") num_positions = T.int32(is_size_var=True) top_prob_offsets = T.match_buffer(var_top_prob_offsets, (num_positions,), "int32") sampled_values = T.match_buffer(var_sampled_values, (num_samples,)) top_prob_probs = T.match_buffer(var_top_prob_probs, (num_positions,)) top_prob_indices = T.match_buffer(var_top_prob_indices, (num_positions,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((num_positions + num_samples + 1023) // 1024, thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("block"): v0 = T.axis.spatial(num_positions + num_samples, ax0_fused_0 * 1024 + ax0_fused_1) T.where(ax0_fused_0 * 1024 + ax0_fused_1 < num_positions + num_samples) T.reads(top_prob_offsets[v0], sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], unsorted_probs[T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]):T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]) + (T.max(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions]) + 1 - T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions])), T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]):T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]) + (T.max(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]) + 1 - T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]))], sample_indices[v0 + (0 - num_positions)], sampling_results[v0 + (0 - num_positions)]) T.writes(top_prob_indices[v0], top_prob_probs[v0], sampled_values[v0 + (0 - num_positions)]) if v0 < num_positions: row: T.int32 = top_prob_offsets[v0] // 
vocab_size col: T.int32 = top_prob_offsets[v0] % vocab_size top_prob_indices[v0] = sorted_indices[row, col] top_prob_probs[v0] = unsorted_probs[row, sorted_indices[row, col]] else: vj: T.int32 = v0 - num_positions sampled_values[vj] = unsorted_probs[sample_indices[vj], sampling_results[vj]] @T.prim_func def scatter_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, n = T.int32(is_size_var=True), T.int32(is_size_var=True) src = T.match_buffer(var_src, (batch_size, n)) indices = T.match_buffer(var_indices, (batch_size,), "int32") m = T.int32(is_size_var=True) dst = T.match_buffer(var_dst, (m, n)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("scatter_2d"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) T.reads(src[v0, v1], indices[v0]) T.writes(dst[indices[v0], v1]) dst[indices[v0], v1] = src[v0, v1] @T.prim_func def shape_func(H: T.Buffer((T.int64(2),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) @T.prim_func def shape_func1(H: T.Buffer((T.int64(3),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) @T.prim_func def shape_func2(H: T.Buffer((T.int64(5),), "int64")): T.func_attr({"tir.is_host_func": 1}) 
H[T.int64(4)] = T.int64(8) * H[T.int64(1)] * T.int64(4) H[T.int64(3)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) @T.prim_func def shape_func3(H: T.Buffer((T.int64(6),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(4)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) H[T.int64(3)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) H[T.int64(5)] = T.int64(32) * H[T.int64(1)] @T.prim_func def shape_func4(H: T.Buffer((T.int64(3),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) @T.prim_func def shape_func5(H: T.Buffer((T.int64(5),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(2)] = T.int64(32) * ((H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096)) H[T.int64(4)] = T.int64(32) * H[T.int64(1)] H[T.int64(3)] = (H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096) @T.prim_func def softmax_with_chunked_sum(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle, var_softmax: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) A = T.match_buffer(var_A, (batch_size, vocab_size)) temperature = T.match_buffer(var_temperature, (batch_size,)) num_chunks = 
T.int64(is_size_var=True) chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) softmax = T.match_buffer(var_softmax, (batch_size, vocab_size)) # with T.block("root"): temp_max_shared = T.alloc_buffer((batch_size,), scope="shared") temp_sum_shared = T.alloc_buffer((batch_size,), scope="shared") for l0_l1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): with T.block("max"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) T.reads(chunked_max[v0, v1]) T.writes(temp_max_shared[v0]) with T.init(): temp_max_shared[v0] = T.float32(-3.4028234663852886e+38) temp_max_shared[v0] = T.max(temp_max_shared[v0], chunked_max[v0, v1]) for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): with T.block("sum_exp"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) T.reads(temperature[v0], chunked_sum[v0, v1], chunked_max[v0, v1], temp_max_shared[v0]) T.writes(temp_sum_shared[v0]) with T.init(): temp_sum_shared[v0] = T.float32(0) temp_sum_shared[v0] = temp_sum_shared[v0] + T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(chunked_sum[v0, v1] + chunked_max[v0, v1] - temp_max_shared[v0]), T.Cast("float32", chunked_max[v0, v1] == temp_max_shared[v0]) * chunked_sum[v0, v1]) for l2_0 in T.serial(T.int64(4), 
annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): for l2_1 in T.thread_binding(T.int64(32), thread="threadIdx.y"): for l2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("log_pad"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.spatial(num_chunks, l0_l1_fused % num_chunks) v2 = T.axis.spatial(T.int64(4096), l2_0 * T.int64(1024) + l2_1 * T.int64(32) + l2_2) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_sum_shared[v0], temp_max_shared[v0]) T.writes(softmax[v0, v1 * T.int64(4096) + v2]) if v1 * T.int64(4096) + v2 < vocab_size: softmax[v0, v1 * T.int64(4096) + v2] = T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(A[v0, v1 * T.int64(4096) + v2] / temperature[v0] - (T.log(temp_sum_shared[v0]) + temp_max_shared[v0])), T.Cast("float32", A[v0, v1 * T.int64(4096) + v2] == temp_max_shared[v0]) / temp_sum_shared[v0]) @T.prim_func def take(model_decoder_embed_tokens_weight3: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), var_reshape707: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape707 = T.match_buffer(var_reshape707, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(model_decoder_embed_tokens_weight3[reshape707[v0], v1], reshape707[v0]) 
T.writes(T_take[v0, v1]) T_take[v0, v1] = model_decoder_embed_tokens_weight3[reshape707[v0], v1] @T.prim_func def take1(model_decoder_embed_positions_weight3: T.Buffer((T.int64(448), T.int64(1280)), "float16"), var_lv133: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv133 = T.match_buffer(var_lv133, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(model_decoder_embed_positions_weight3[lv133[v0], v1], lv133[v0]) T.writes(T_take[v0, v1]) T_take[v0, v1] = model_decoder_embed_positions_weight3[lv133[v0], v1] @T.prim_func def take2(var_layer_norm161: T.handle, var_logit_positions: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() layer_norm161 = T.match_buffer(var_layer_norm161, (T.int64(1), seq_len, T.int64(1280)), "float16") batch_size = T.int64() logit_positions = T.match_buffer(var_logit_positions, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (T.int64(1), batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + 
ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(layer_norm161[T.int64(0), logit_positions[v0], v1], logit_positions[v0]) T.writes(T_take[T.int64(0), v0, v1]) T_take[T.int64(0), v0, v1] = layer_norm161[T.int64(0), logit_positions[v0], v1] @T.prim_func def take3(model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), reshape1353: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0], reshape1353[T.int64(0)]) T.writes(T_take[T.int64(0), v0]) T_take[T.int64(0), v0] = model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0] @T.prim_func def take4(model_decoder_embed_positions_weight5: T.Buffer((T.int64(448), T.int64(1280)), "float16"), lv264: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0], lv264[T.int64(0)]) 
T.writes(T_take[T.int64(0), v0]) T_take[T.int64(0), v0] = model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0] @T.prim_func def take_sorted_probs(var_probs: T.handle, var_lv1: T.handle, var_take_sorted_probs: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(), T.int64() probs = T.match_buffer(var_probs, (batch_size, vocab_size)) lv1 = T.match_buffer(var_lv1, (batch_size, vocab_size), "int32") batch_size_1, vocab_size_1 = T.int64(), T.int64() take_sorted_probs = T.match_buffer(var_take_sorted_probs, (batch_size_1, vocab_size_1)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size_1 * vocab_size_1 + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("take_sorted_probs"): v0 = T.axis.spatial(batch_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size_1 * batch_size_1) // vocab_size_1) v1 = T.axis.spatial(vocab_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size_1) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size_1 * vocab_size_1) T.reads(probs[v0, lv1[v0, v1]], lv1[v0, v1]) T.writes(take_sorted_probs[v0, v1]) take_sorted_probs[v0, v1] = probs[v0, lv1[v0, v1]] @T.prim_func def tir_kv_cache_debug_get_kv(var_pages: T.handle, var_position_map: T.handle, var_k_data: T.handle, var_v_data: T.handle, layer_id: T.int64): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_pages, page_size = T.int64(), T.int64(is_size_var=True) pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") seqlen = T.int64(is_size_var=True) position_map = T.match_buffer(var_position_map, (seqlen,), "int32", offset_factor=1) k_data = T.match_buffer(var_k_data, (32, seqlen, 20, 64), "float16") v_data = T.match_buffer(var_v_data, (32, seqlen, 20, 64), "float16") # with T.block("root"): for p_h_d_fused_0 in T.thread_binding((seqlen * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for p_h_d_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("copy0"): vp = T.axis.spatial(seqlen, (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(1280) // T.int64(64))) vd = T.axis.spatial(64, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(64))) T.where(p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1 < seqlen * T.int64(1280)) T.reads(position_map[vp], pages[T.Cast("int64", position_map[vp]) // page_size, 0:2, vh, T.Cast("int64", position_map[vp]) % page_size, vd]) T.writes(k_data[layer_id, vp, vh, vd], v_data[layer_id, vp, vh, vd]) position: T.int32 = position_map[vp] k_data[layer_id, vp, vh, vd] = pages[T.Cast("int64", position) // page_size, 0, vh, T.Cast("int64", position) % page_size, vd] v_data[layer_id, vp, vh, vd] = pages[T.Cast("int64", position) // page_size, 1, vh, T.Cast("int64", position) % page_size, vd] @T.prim_func def tir_kv_cache_transpose_append(var_pages: T.handle, var_k_data: T.handle, var_v_data: T.handle, var_position_map: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", 
"libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_pages = T.int64() pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") ntoken = T.int64(is_size_var=True) k_data = T.match_buffer(var_k_data, (ntoken, 20, 64), "float16") v_data = T.match_buffer(var_v_data, (ntoken, 20, 64), "float16") position_map = T.match_buffer(var_position_map, (ntoken,), "int32", offset_factor=1) # with T.block("root"): for global_pos_h_f_fused_0 in T.thread_binding((ntoken * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for global_pos_h_f_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): if position_map[(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)] != -1: with T.block("k_transpose_append"): vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) vf = T.axis.spatial(64, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * T.int64(1280)) T.reads(position_map[vgpos], k_data[vgpos, vh, vf]) T.writes(pages[position_map[vgpos] // 16, 0, vh, position_map[vgpos] % 16, vf]) position: T.int32 = position_map[vgpos] pages[position // 16, 0, vh, position % 16, vf] = k_data[vgpos, vh, vf] with T.block("v_transpose_append"): vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) vf = T.axis.spatial(64, T.Cast("int32", 
(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * T.int64(1280)) T.reads(position_map[vgpos], v_data[vgpos, vh, vf]) T.writes(pages[position_map[vgpos] // 16, 1, vh, position_map[vgpos] % 16, vf]) position: T.int32 = position_map[vgpos] pages[position // 16, 1, vh, position % 16, vf] = v_data[vgpos, vh, vf] @T.prim_func def top_p_pivot_cutoff(var_prob: T.handle, var_top_p_arr: T.handle, var_init_pivots: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) B, N = T.int32(), T.int32() prob = T.match_buffer(var_prob, (B, N)) top_p_arr = T.match_buffer(var_top_p_arr, (B,)) init_pivots = T.match_buffer(var_init_pivots, (B, 3)) final_pivot = T.match_buffer(var_final_pivot, (B,)) final_lsum = T.match_buffer(var_final_lsum, (B,)) # with T.block("root"): pivot = T.alloc_buffer((3,), scope="local") top_p = T.alloc_buffer((1,), scope="local") L = T.alloc_buffer((1,), scope="shared") R_1 = T.alloc_buffer((1,), scope="shared") L_local = T.alloc_buffer((1,), scope="local") R_local = T.alloc_buffer((1,), scope="local") q = T.alloc_buffer((1,), scope="local") lsum = T.alloc_buffer((3,), scope="local") lmin_broadcast = T.alloc_buffer((1,), scope="shared") lmin_broadcast_local = T.alloc_buffer((1,), scope="local") lmin = T.alloc_buffer((3,), scope="local") cmin = T.alloc_buffer((3,), "int32", scope="local") total_sum = T.alloc_buffer((1,), scope="local") it = T.alloc_buffer((1,), "int32", scope="local") es_local = T.alloc_buffer((1,), "bool", scope="local") es = T.alloc_buffer((1,), "bool", scope="shared") find_pivot_local = T.alloc_buffer((1,), "bool", scope="local") 
find_pivot = T.alloc_buffer((1,), "bool", scope="shared") total_sum_reduce = T.alloc_buffer((1,), scope="local") lsum_reduce = T.alloc_buffer((1,), scope="local") lmin_reduce = T.alloc_buffer((1,), scope="local") cmin_reduce = T.alloc_buffer((1,), "int32", scope="local") for _bx in T.thread_binding(B, thread="blockIdx.x"): for _tx in T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): b, tx = T.axis.remap("SS", [_bx, _tx]) T.reads(top_p_arr[b], top_p[0], L[0], R_1[0], init_pivots[b, 0:3], L_local[0], R_local[0], find_pivot_local[0], it[0], es_local[0], prob[b, it[0] * 1024 + tx], total_sum[0], q[0], pivot[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lsum[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], cmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], total_sum_reduce[0], es[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], lsum_reduce[0], cmin_reduce[0], find_pivot[0]) T.writes(top_p[0], L[0], R_1[0], find_pivot[0], L_local[0], R_local[0], pivot[0:3], find_pivot_local[0], final_lsum[b], final_pivot[b], lsum[0:3], lmin[0:3], cmin[0:3], total_sum[0], it[0], es_local[0], q[0], total_sum_reduce[0], es[0], lsum_reduce[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], cmin_reduce[0]) top_p[0] = top_p_arr[b] if tx == 0: L[0] = T.float32(1) - top_p[0] R_1[0] = T.float32(9.9999999999999995e-08) find_pivot[0] = T.bool(False) T.tvm_storage_sync("shared") L_local[0] = L[0] R_local[0] = R_1[0] for i in T.unroll(3): pivot[i] = init_pivots[b, i] find_pivot_local[0] = T.bool(False) if L_local[0] - R_local[0] <= T.float32(9.9999999999999995e-08): if tx == 0: final_lsum[b] = T.float32(1) final_pivot[b] = T.float32(0) find_pivot_local[0] = T.bool(True) while T.tvm_thread_invariant(L_local[0] - R_local[0] > T.float32(9.9999999999999995e-08) and not find_pivot_local[0]): 
T.tvm_storage_sync("shared") for pidx in T.unroll(3): lsum[pidx] = T.float32(0) lmin[pidx] = T.float32(3.4028234663852886e+38) cmin[pidx] = 0 total_sum[0] = T.float32(0) it[0] = 0 es_local[0] = T.bool(False) while it[0] < (N + 1024 - 1) // 1024 and not es_local[0]: q[0] = T.if_then_else(it[0] * 1024 + tx < N, prob[b, it[0] * 1024 + tx], T.float32(0)) total_sum[0] = total_sum[0] + q[0] for pidx in T.unroll(3): if q[0] >= pivot[pidx]: lsum[pidx] = lsum[pidx] + q[0] if lmin[pidx] > q[0]: lmin[pidx] = q[0] cmin[pidx] = 1 else: if lmin[pidx] == q[0]: cmin[pidx] = cmin[pidx] + 1 it[0] = it[0] + 1 if it[0] % 32 == 0: with T.block("block_cross_thread"): T.reads(total_sum[0]) T.writes(total_sum_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), total_sum[0], T.bool(True), total_sum_reduce[0], tx) if tx == 0: es[0] = T.float32(1) - total_sum_reduce[0] < pivot[2] T.tvm_storage_sync("shared") es_local[0] = es[0] T.tvm_storage_sync("shared") for pidx in range(3): with T.block("block_cross_thread"): T.reads(lsum[pidx]) T.writes(lsum_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), lsum[pidx], T.bool(True), lsum_reduce[0], tx) with T.block("block_cross_thread"): T.reads(lmin[pidx]) T.writes(lmin_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: T.min(x0, y0), [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), lmin[pidx], T.bool(True), lmin_reduce[0], tx) if tx == 0: lmin_broadcast[0] = lmin_reduce[0] T.tvm_storage_sync("shared") lmin_broadcast_local[0] = lmin_broadcast[0] if lmin[pidx] > lmin_broadcast_local[0]: cmin[pidx] = 0 if tx == 0: lsum[pidx] = lsum_reduce[0] lmin[pidx] = lmin_reduce[0] with T.block("block_cross_thread"): T.reads(cmin[pidx]) T.writes(cmin_reduce[0]) T.attr(T.comm_reducer(lambda 
x0, y0: x0 + y0, [0]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), cmin[pidx], T.bool(True), cmin_reduce[0], tx) if tx == 0: cmin[pidx] = cmin_reduce[0] T.tvm_storage_sync("shared") if tx == 0: it[0] = 0 while it[0] < 3 and not find_pivot_local[0]: if lsum[it[0]] >= top_p[0] and top_p[0] > lsum[it[0]] - T.Cast("float32", cmin[it[0]]) * lmin[it[0]]: find_pivot[0] = T.bool(True) find_pivot_local[0] = T.bool(True) final_pivot[b] = pivot[it[0]] final_lsum[b] = lsum[it[0]] else: if lsum[it[0]] - lmin[it[0]] * T.Cast("float32", cmin[it[0]]) >= top_p[0]: R_1[0] = pivot[it[0]] final_lsum[b] = lsum[it[0]] else: if lsum[it[0]] < top_p[0]: L[0] = pivot[it[0]] it[0] = it[0] + 1 T.tvm_storage_sync("shared") L_local[0] = L[0] R_local[0] = R_1[0] find_pivot_local[0] = find_pivot[0] for pidx in T.unroll(3): pivot[pidx] = L[0] - T.Cast("float32", pidx + 1) * (L_local[0] - R_local[0]) / T.float32(4) if tx == 0: if not find_pivot_local[0]: final_pivot[b] = R_local[0] if R_local[0] == T.float32(9.9999999999999995e-08): final_lsum[b] = lsum[2] @T.prim_func def top_p_renorm_after_cutoff(var_prob: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle, var_renorm_prob: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) B, N = T.int32(), T.int32() prob = T.match_buffer(var_prob, (B, N)) final_pivot = T.match_buffer(var_final_pivot, (B,)) final_lsum = T.match_buffer(var_final_lsum, (B,)) renorm_prob = T.match_buffer(var_renorm_prob, (B, N)) # with T.block("root"): pivot = T.alloc_buffer((1,), scope="local") lsum = T.alloc_buffer((1,), scope="local") for _by in T.thread_binding(B, thread="blockIdx.y"): for _bx in T.thread_binding((B + 511) // B, thread="blockIdx.x"): for _tx in 
T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): by, bx, tx = T.axis.remap("SSS", [_by, _bx, _tx]) T.reads(final_pivot[by], final_lsum[by], prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * ((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)], pivot[0], lsum[0]) T.writes(pivot[0], lsum[0], renorm_prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * ((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)]) pivot[0] = final_pivot[by] lsum[0] = final_lsum[by] for i in range(((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024)): if i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx < N: renorm_prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] = T.if_then_else(prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] >= pivot[0], prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] / lsum[0], T.float32(0)) @R.function def _metadata() -> R.Object: shape_heap: R.Object = R.null_value() return R.str("{\"model_type\": \"whisper\", \"quantization\": \"q0f16\", \"context_window_size\": 1500, \"sliding_window_size\": -1, \"attention_sink_size\": -1, \"prefill_chunk_size\": 15000, \"tensor_parallel_shards\": 1, \"kv_state_kind\": \"kv_cache\", \"max_batch_size\": 8, \"params\": [{\"name\": 
\"model.encoder.conv1.weight\", \"shape\": [1280, 128, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv1.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.weight\", \"shape\": [1280, 1280, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.embed_positions.weight\", \"shape\": [1500, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.17.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_tokens.weight\", \"shape\": [51866, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_positions.weight\", \"shape\": [448, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.1.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.3.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.5.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.7.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.bias\", \"shape\": 
[5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.9.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.12.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.13.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.14.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.weight\", \"shape\": [1280, 
1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.weight\", 
\"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.27.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}], \"kv_cache\": {\"num_hidden_layers\": 32, \"num_attention_heads\": 20, \"num_key_value_heads\": 20, \"head_dim\": 64}, \"memory_usage\": {\"argsort_probs\": 0, \"batch_compute_cross_attn_kv\": 61440000, \"batch_decode\": 1987392, \"batch_encode\": 276480000, \"batch_prefill\": 616080192, \"create_tir_paged_kv_cache\": 0, \"decode\": 243304, \"multinomial_from_uniform\": 32, \"prefill\": 614610024, \"renormalize_by_top_p\": 64, \"sample_with_top_p\": 64, \"sampler_take_probs\": 416, \"sampler_verify_draft_tokens\": 
0, \"softmax_with_temperature\": 0}}") @R.function def argsort_probs(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32")) -> R.Tuple(R.Tensor(("batch_size", "vocab_size"), dtype="float32"), R.Tensor(("batch_size", "vocab_size"), dtype="int32")): batch_size = T.int64() vocab_size = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) cls.shape_func2(shape_heap) gv2560: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage30: R.Object = R.vm.alloc_storage(gv2560, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2561: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) lv: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage30, R.prim_value(0), gv2561, R.dtype("uint8")) R.vm.kill_object(storage30) gv2562: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage31: R.Object = R.vm.alloc_storage(gv2562, 
R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2563: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc1976: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage31, R.prim_value(0), gv2563, R.dtype("int32")) R.vm.kill_object(storage31) cls.argsort_thrust(probs, lv, alloc1976) R.vm.kill_object(lv) gv2564: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage32: R.Object = R.vm.alloc_storage(gv2564, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2565: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc1977: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage32, R.prim_value(0), gv2565, R.dtype("float32")) R.vm.kill_object(storage32) cls.take_sorted_probs(probs, alloc1976, alloc1977) gv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="int32", ndim=2)) = alloc1977, alloc1976 R.vm.kill_object(alloc1976) R.vm.kill_object(alloc1977) gv2566: R.Tensor(dtype="float32", ndim=2) = gv1[0] R.call_packed("vm.builtin.match_shape", gv2566, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv2567: R.Tensor(dtype="int32", ndim=2) = gv1[1] R.call_packed("vm.builtin.match_shape", gv2567, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), 
R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) return gv1 @R.function def batch_compute_cross_attn_kv(encoder_hidden_states: R.Tensor(("batch_size", 1500, 1280), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Object: batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", encoder_hidden_states, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", encoder_hidden_states, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) cls.shape_func(shape_heap) model_decoder_layers_0_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[498] storage11: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc554: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv883, R.dtype("float16")) _552: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_0_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc554) R.vm.kill_object(model_decoder_layers_0_encoder_attn_k_proj_weight1) gv884: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape256: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc554, gv884, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc554) model_decoder_layers_0_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[499] model_decoder_layers_0_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[500] storage12: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc555: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv885, R.dtype("float16")) _553: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_0_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_0_encoder_attn_v_proj_bias1, alloc555) R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_bias1) gv886: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape257: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc555, gv886, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc555) gv887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape258: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape256, gv887, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape256) gv888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape259: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape257, gv888, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape257) lv36: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", paged_kv_cache, R.prim_value(0), reshape258, reshape259, sinfo_args=(R.Object,)) R.vm.kill_object(reshape258) R.vm.kill_object(reshape259) model_decoder_layers_1_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[522] gv889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc556: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv889, R.dtype("float16")) _554: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_1_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc556) R.vm.kill_object(model_decoder_layers_1_encoder_attn_k_proj_weight1) gv890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape260: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc556, gv890, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc556) model_decoder_layers_1_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[523] model_decoder_layers_1_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[524] gv891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc557: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv891, R.dtype("float16")) _555: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_1_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_1_encoder_attn_v_proj_bias1, alloc557) R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_bias1) gv892: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape261: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc557, gv892, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc557) gv893: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape262: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape260, gv893, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape260) gv894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape263: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape261, gv894, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape261) lv37: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv36, R.prim_value(1), reshape262, reshape263, sinfo_args=(R.Object,)) R.vm.kill_object(reshape262) R.vm.kill_object(reshape263) R.vm.kill_object(lv36) 
model_decoder_layers_2_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[546] gv895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc558: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv895, R.dtype("float16")) _556: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_2_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc558) R.vm.kill_object(model_decoder_layers_2_encoder_attn_k_proj_weight1) gv896: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape264: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc558, gv896, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc558) model_decoder_layers_2_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[547] model_decoder_layers_2_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[548] gv897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc559: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv897, R.dtype("float16")) _557: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_2_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_2_encoder_attn_v_proj_bias1, alloc559) 
R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_bias1) gv898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape265: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc559, gv898, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc559) gv899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape266: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape264, gv899, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape264) gv900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape267: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape265, gv900, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape265) lv38: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv37, R.prim_value(2), reshape266, reshape267, sinfo_args=(R.Object,)) R.vm.kill_object(reshape266) R.vm.kill_object(reshape267) R.vm.kill_object(lv37) model_decoder_layers_3_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[570] gv901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc560: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv901, R.dtype("float16")) _558: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_3_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc560) R.vm.kill_object(model_decoder_layers_3_encoder_attn_k_proj_weight1) gv902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape268: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc560, gv902, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc560) model_decoder_layers_3_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[571] model_decoder_layers_3_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[572] gv903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc561: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv903, R.dtype("float16")) _559: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_3_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_3_encoder_attn_v_proj_bias1, alloc561) R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_bias1) gv904: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape269: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc561, gv904, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc561) gv905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape270: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape268, gv905, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape268) gv906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape271: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape269, gv906, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape269) lv39: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv38, R.prim_value(3), reshape270, reshape271, sinfo_args=(R.Object,)) R.vm.kill_object(reshape270) R.vm.kill_object(reshape271) R.vm.kill_object(lv38) model_decoder_layers_4_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[594] gv907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc562: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv907, R.dtype("float16")) _560: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_4_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc562) R.vm.kill_object(model_decoder_layers_4_encoder_attn_k_proj_weight1) gv908: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape272: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc562, gv908, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc562) model_decoder_layers_4_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[595] model_decoder_layers_4_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[596] gv909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc563: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv909, R.dtype("float16")) _561: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_4_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_4_encoder_attn_v_proj_bias1, alloc563) R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_bias1) gv910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape273: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc563, gv910, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc563) gv911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape274: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape272, gv911, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape272) gv912: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape275: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape273, gv912, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape273) lv40: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv39, R.prim_value(4), reshape274, reshape275, sinfo_args=(R.Object,)) R.vm.kill_object(reshape274) R.vm.kill_object(reshape275) R.vm.kill_object(lv39) model_decoder_layers_5_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[618] gv913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc564: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv913, R.dtype("float16")) _562: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_5_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc564) R.vm.kill_object(model_decoder_layers_5_encoder_attn_k_proj_weight1) gv914: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape276: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc564, gv914, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc564) model_decoder_layers_5_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[619] model_decoder_layers_5_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[620] gv915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc565: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv915, R.dtype("float16")) _563: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_5_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_5_encoder_attn_v_proj_bias1, alloc565) R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_bias1) gv916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape277: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc565, gv916, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc565) gv917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape278: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape276, gv917, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape276) gv918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape279: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape277, gv918, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape277) lv41: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv40, R.prim_value(5), reshape278, reshape279, sinfo_args=(R.Object,)) R.vm.kill_object(reshape278) R.vm.kill_object(reshape279) R.vm.kill_object(lv40) model_decoder_layers_6_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[642] gv919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc566: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv919, R.dtype("float16")) _564: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_6_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc566) R.vm.kill_object(model_decoder_layers_6_encoder_attn_k_proj_weight1) gv920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape280: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc566, gv920, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc566) model_decoder_layers_6_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[643] model_decoder_layers_6_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[644] gv921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc567: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv921, R.dtype("float16")) _565: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_6_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_6_encoder_attn_v_proj_bias1, alloc567) R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_bias1) gv922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape281: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc567, gv922, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc567) gv923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape282: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape280, gv923, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape280) gv924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape283: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape281, gv924, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape281) lv42: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv41, R.prim_value(6), reshape282, reshape283, sinfo_args=(R.Object,)) R.vm.kill_object(reshape282) R.vm.kill_object(reshape283) R.vm.kill_object(lv41) model_decoder_layers_7_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[666] gv925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc568: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv925, R.dtype("float16")) _566: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_7_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc568) R.vm.kill_object(model_decoder_layers_7_encoder_attn_k_proj_weight1) gv926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape284: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc568, gv926, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc568) 
model_decoder_layers_7_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[667] model_decoder_layers_7_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[668] gv927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc569: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv927, R.dtype("float16")) _567: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_7_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_7_encoder_attn_v_proj_bias1, alloc569) R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_bias1) gv928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape285: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc569, gv928, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc569) gv929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape286: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape284, gv929, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape284) gv930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape287: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape285, gv930, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape285) lv43: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv42, R.prim_value(7), reshape286, reshape287, sinfo_args=(R.Object,)) R.vm.kill_object(reshape286) R.vm.kill_object(reshape287) R.vm.kill_object(lv42) model_decoder_layers_8_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[690] gv931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc570: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv931, R.dtype("float16")) _568: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_8_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc570) R.vm.kill_object(model_decoder_layers_8_encoder_attn_k_proj_weight1) gv932: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape288: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc570, gv932, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc570) model_decoder_layers_8_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[691] model_decoder_layers_8_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[692] gv933: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc571: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv933, R.dtype("float16")) _569: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_8_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_8_encoder_attn_v_proj_bias1, alloc571) R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_bias1) gv934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape289: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc571, gv934, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc571) gv935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape290: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape288, gv935, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape288) gv936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape291: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape289, gv936, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape289) lv44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv43, R.prim_value(8), reshape290, reshape291, sinfo_args=(R.Object,)) R.vm.kill_object(reshape290) R.vm.kill_object(reshape291) R.vm.kill_object(lv43) model_decoder_layers_9_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[714] gv937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc572: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv937, R.dtype("float16")) _570: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_9_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc572) R.vm.kill_object(model_decoder_layers_9_encoder_attn_k_proj_weight1) gv938: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape292: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc572, gv938, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc572) model_decoder_layers_9_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[715] model_decoder_layers_9_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[716] gv939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc573: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv939, R.dtype("float16")) _571: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_9_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_9_encoder_attn_v_proj_bias1, alloc573) R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_bias1) gv940: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape293: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc573, gv940, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc573) gv941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape294: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape292, gv941, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape292) gv942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape295: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape293, gv942, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape293) lv45: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv44, R.prim_value(9), reshape294, reshape295, 
sinfo_args=(R.Object,)) R.vm.kill_object(reshape294) R.vm.kill_object(reshape295) R.vm.kill_object(lv44) model_decoder_layers_10_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[738] gv943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc574: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv943, R.dtype("float16")) _572: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_10_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc574) R.vm.kill_object(model_decoder_layers_10_encoder_attn_k_proj_weight1) gv944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape296: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc574, gv944, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc574) model_decoder_layers_10_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[739] model_decoder_layers_10_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[740] gv945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc575: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv945, R.dtype("float16")) _573: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_10_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_10_encoder_attn_v_proj_bias1, alloc575) R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_bias1) gv946: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape297: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc575, gv946, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc575) gv947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape298: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape296, gv947, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape296) gv948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape299: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape297, gv948, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape297) lv46: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv45, R.prim_value(10), reshape298, reshape299, sinfo_args=(R.Object,)) R.vm.kill_object(reshape298) R.vm.kill_object(reshape299) R.vm.kill_object(lv45) model_decoder_layers_11_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[762] gv949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc576: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv949, R.dtype("float16")) _574: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_11_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc576) R.vm.kill_object(model_decoder_layers_11_encoder_attn_k_proj_weight1) gv950: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape300: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc576, gv950, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc576) model_decoder_layers_11_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[763] model_decoder_layers_11_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[764] gv951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc577: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv951, R.dtype("float16")) _575: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_11_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_11_encoder_attn_v_proj_bias1, alloc577) R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_weight1) 
R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_bias1) gv952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape301: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc577, gv952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc577) gv953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape302: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape300, gv953, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape300) gv954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape303: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape301, gv954, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape301) lv47: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv46, R.prim_value(11), reshape302, reshape303, sinfo_args=(R.Object,)) R.vm.kill_object(reshape302) R.vm.kill_object(reshape303) R.vm.kill_object(lv46) model_decoder_layers_12_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[786] gv955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc578: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv955, R.dtype("float16")) _576: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_12_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc578) R.vm.kill_object(model_decoder_layers_12_encoder_attn_k_proj_weight1) gv956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape304: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc578, gv956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc578) model_decoder_layers_12_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[787] model_decoder_layers_12_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[788] gv957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc579: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv957, R.dtype("float16")) _577: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_12_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_12_encoder_attn_v_proj_bias1, alloc579) R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_bias1) gv958: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape305: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc579, gv958, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc579) gv959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape306: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape304, gv959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape304) gv960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape307: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape305, gv960, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape305) lv48: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv47, R.prim_value(12), reshape306, reshape307, sinfo_args=(R.Object,)) R.vm.kill_object(reshape306) R.vm.kill_object(reshape307) R.vm.kill_object(lv47) model_decoder_layers_13_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[810] gv961: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc580: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv961, R.dtype("float16")) _578: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_13_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc580) R.vm.kill_object(model_decoder_layers_13_encoder_attn_k_proj_weight1) gv962: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape308: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc580, gv962, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc580) model_decoder_layers_13_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[811] model_decoder_layers_13_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[812] gv963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc581: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv963, R.dtype("float16")) _579: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_13_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_13_encoder_attn_v_proj_bias1, alloc581) R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_bias1) gv964: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape309: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc581, gv964, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc581) gv965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape310: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape308, gv965, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape308) gv966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape311: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape309, gv966, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape309) lv49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv48, R.prim_value(13), reshape310, reshape311, sinfo_args=(R.Object,)) R.vm.kill_object(reshape310) R.vm.kill_object(reshape311) R.vm.kill_object(lv48) model_decoder_layers_14_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[834] gv967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc582: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv967, R.dtype("float16")) _580: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_14_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc582) R.vm.kill_object(model_decoder_layers_14_encoder_attn_k_proj_weight1) gv968: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape312: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc582, gv968, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc582) model_decoder_layers_14_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[835] model_decoder_layers_14_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[836] gv969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc583: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv969, R.dtype("float16")) _581: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_14_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_14_encoder_attn_v_proj_bias1, alloc583) R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_bias1) gv970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape313: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc583, gv970, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc583) gv971: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape314: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape312, gv971, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape312) gv972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape315: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape313, gv972, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape313) lv50: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv49, R.prim_value(14), reshape314, reshape315, sinfo_args=(R.Object,)) R.vm.kill_object(reshape314) R.vm.kill_object(reshape315) R.vm.kill_object(lv49) model_decoder_layers_15_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[858] gv973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc584: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv973, R.dtype("float16")) _582: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_15_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc584) R.vm.kill_object(model_decoder_layers_15_encoder_attn_k_proj_weight1) gv974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape316: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc584, gv974, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc584) model_decoder_layers_15_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[859] model_decoder_layers_15_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[860] gv975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc585: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv975, R.dtype("float16")) _583: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_15_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_15_encoder_attn_v_proj_bias1, alloc585) R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_bias1) gv976: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape317: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc585, gv976, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc585) gv977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape318: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape316, gv977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape316) gv978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape319: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape317, gv978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape317) lv51: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv50, R.prim_value(15), reshape318, reshape319, sinfo_args=(R.Object,)) R.vm.kill_object(reshape318) R.vm.kill_object(reshape319) R.vm.kill_object(lv50) model_decoder_layers_16_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[882] gv979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc586: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv979, R.dtype("float16")) _584: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_16_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc586) R.vm.kill_object(model_decoder_layers_16_encoder_attn_k_proj_weight1) gv980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape320: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc586, gv980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc586) 
model_decoder_layers_16_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[883] model_decoder_layers_16_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[884] gv981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc587: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv981, R.dtype("float16")) _585: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_16_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_16_encoder_attn_v_proj_bias1, alloc587) R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_bias1) gv982: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape321: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc587, gv982, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc587) gv983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape322: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape320, gv983, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape320) gv984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape323: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape321, gv984, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape321) lv52: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv51, R.prim_value(16), reshape322, reshape323, sinfo_args=(R.Object,)) R.vm.kill_object(reshape322) R.vm.kill_object(reshape323) R.vm.kill_object(lv51) model_decoder_layers_17_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[906] gv985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc588: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv985, R.dtype("float16")) _586: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_17_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc588) R.vm.kill_object(model_decoder_layers_17_encoder_attn_k_proj_weight1) gv986: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape324: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc588, gv986, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc588) model_decoder_layers_17_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[907] model_decoder_layers_17_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[908] gv987: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc589: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv987, R.dtype("float16")) _587: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_17_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_17_encoder_attn_v_proj_bias1, alloc589) R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_bias1) gv988: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape325: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc589, gv988, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc589) gv989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape326: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape324, gv989, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape324) gv990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape327: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape325, gv990, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape325) lv53: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv52, R.prim_value(17), reshape326, reshape327, sinfo_args=(R.Object,)) R.vm.kill_object(reshape326) R.vm.kill_object(reshape327) R.vm.kill_object(lv52) model_decoder_layers_18_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[930] gv991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc590: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv991, R.dtype("float16")) _588: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_18_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc590) R.vm.kill_object(model_decoder_layers_18_encoder_attn_k_proj_weight1) gv992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape328: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc590, gv992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc590) model_decoder_layers_18_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[931] model_decoder_layers_18_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[932] gv993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc591: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv993, R.dtype("float16")) _589: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_18_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_18_encoder_attn_v_proj_bias1, alloc591) R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_bias1) gv994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape329: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc591, gv994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc591) gv995: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape330: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape328, gv995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape328) gv996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape331: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape329, gv996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape329) lv54: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv53, R.prim_value(18), reshape330, reshape331, 
sinfo_args=(R.Object,)) R.vm.kill_object(reshape330) R.vm.kill_object(reshape331) R.vm.kill_object(lv53) model_decoder_layers_19_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[954] gv997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc592: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv997, R.dtype("float16")) _590: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_19_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc592) R.vm.kill_object(model_decoder_layers_19_encoder_attn_k_proj_weight1) gv998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape332: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc592, gv998, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc592) model_decoder_layers_19_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[955] model_decoder_layers_19_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[956] gv999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc593: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv999, R.dtype("float16")) _591: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_19_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_19_encoder_attn_v_proj_bias1, alloc593) R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_bias1) gv1000: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape333: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc593, gv1000, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc593) gv1001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape334: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape332, gv1001, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape332) gv1002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape335: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape333, gv1002, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape333) lv55: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv54, R.prim_value(19), reshape334, reshape335, sinfo_args=(R.Object,)) R.vm.kill_object(reshape334) R.vm.kill_object(reshape335) R.vm.kill_object(lv54) model_decoder_layers_20_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") 
= packed_params[978] gv1003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc594: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1003, R.dtype("float16")) _592: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_20_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc594) R.vm.kill_object(model_decoder_layers_20_encoder_attn_k_proj_weight1) gv1004: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape336: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc594, gv1004, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc594) model_decoder_layers_20_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[979] model_decoder_layers_20_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[980] gv1005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc595: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1005, R.dtype("float16")) _593: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_20_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_20_encoder_attn_v_proj_bias1, alloc595) R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_weight1) 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_bias1) gv1006: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape337: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc595, gv1006, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc595) gv1007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape338: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape336, gv1007, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape336) gv1008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape339: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape337, gv1008, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape337) lv56: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv55, R.prim_value(20), reshape338, reshape339, sinfo_args=(R.Object,)) R.vm.kill_object(reshape338) R.vm.kill_object(reshape339) R.vm.kill_object(lv55) model_decoder_layers_21_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1002] gv1009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc596: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1009, R.dtype("float16")) _594: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_21_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc596) R.vm.kill_object(model_decoder_layers_21_encoder_attn_k_proj_weight1) gv1010: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape340: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc596, gv1010, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc596) model_decoder_layers_21_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1003] model_decoder_layers_21_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1004] gv1011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc597: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1011, R.dtype("float16")) _595: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_21_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_21_encoder_attn_v_proj_bias1, alloc597) R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_bias1) gv1012: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape341: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc597, gv1012, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc597) gv1013: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape342: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape340, gv1013, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape340) gv1014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape343: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape341, gv1014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape341) lv57: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv56, R.prim_value(21), reshape342, reshape343, sinfo_args=(R.Object,)) R.vm.kill_object(reshape342) R.vm.kill_object(reshape343) R.vm.kill_object(lv56) model_decoder_layers_22_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1026] gv1015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc598: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1015, R.dtype("float16")) _596: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_22_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc598) R.vm.kill_object(model_decoder_layers_22_encoder_attn_k_proj_weight1) gv1016: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape344: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc598, gv1016, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc598) model_decoder_layers_22_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1027] model_decoder_layers_22_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1028] gv1017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc599: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1017, R.dtype("float16")) _597: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_22_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_22_encoder_attn_v_proj_bias1, alloc599) R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_bias1) gv1018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape345: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc599, gv1018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc599) gv1019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape346: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape344, gv1019, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape344) gv1020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape347: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape345, gv1020, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape345) lv58: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv57, R.prim_value(22), reshape346, reshape347, sinfo_args=(R.Object,)) R.vm.kill_object(reshape346) R.vm.kill_object(reshape347) R.vm.kill_object(lv57) model_decoder_layers_23_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1050] gv1021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc600: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1021, R.dtype("float16")) _598: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_23_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc600) R.vm.kill_object(model_decoder_layers_23_encoder_attn_k_proj_weight1) 
gv1022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape348: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc600, gv1022, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc600) model_decoder_layers_23_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1051] model_decoder_layers_23_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1052] gv1023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc601: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1023, R.dtype("float16")) _599: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_23_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_23_encoder_attn_v_proj_bias1, alloc601) R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_bias1) gv1024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape349: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc601, gv1024, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc601) gv1025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape350: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape348, gv1025, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape348) gv1026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape351: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape349, gv1026, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape349) lv59: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv58, R.prim_value(23), reshape350, reshape351, sinfo_args=(R.Object,)) R.vm.kill_object(reshape350) R.vm.kill_object(reshape351) R.vm.kill_object(lv58) model_decoder_layers_24_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1074] gv1027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc602: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1027, R.dtype("float16")) _600: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_24_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc602) R.vm.kill_object(model_decoder_layers_24_encoder_attn_k_proj_weight1) gv1028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape352: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc602, gv1028, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc602) model_decoder_layers_24_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1075] model_decoder_layers_24_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1076] gv1029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc603: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1029, R.dtype("float16")) _601: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_24_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_24_encoder_attn_v_proj_bias1, alloc603) R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_bias1) gv1030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape353: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc603, gv1030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc603) gv1031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape354: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape352, gv1031, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape352) gv1032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape355: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape353, gv1032, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape353) lv60: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv59, R.prim_value(24), reshape354, reshape355, sinfo_args=(R.Object,)) R.vm.kill_object(reshape354) R.vm.kill_object(reshape355) R.vm.kill_object(lv59) model_decoder_layers_25_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1098] gv1033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc604: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1033, R.dtype("float16")) _602: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_25_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc604) R.vm.kill_object(model_decoder_layers_25_encoder_attn_k_proj_weight1) gv1034: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape356: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc604, gv1034, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc604) model_decoder_layers_25_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1099] model_decoder_layers_25_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1100] gv1035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc605: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1035, R.dtype("float16")) _603: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_25_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_25_encoder_attn_v_proj_bias1, alloc605) R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_bias1) gv1036: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape357: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc605, gv1036, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc605) gv1037: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape358: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape356, gv1037, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape356) gv1038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape359: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape357, gv1038, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape357) lv61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv60, R.prim_value(25), reshape358, reshape359, sinfo_args=(R.Object,)) R.vm.kill_object(reshape358) R.vm.kill_object(reshape359) R.vm.kill_object(lv60) model_decoder_layers_26_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1122] gv1039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc606: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1039, R.dtype("float16")) _604: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_26_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc606) R.vm.kill_object(model_decoder_layers_26_encoder_attn_k_proj_weight1) gv1040: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape360: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc606, gv1040, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc606) model_decoder_layers_26_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1123] model_decoder_layers_26_encoder_attn_v_proj_bias1: R.Tensor((1280,), 
dtype="float16") = packed_params[1124] gv1041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc607: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1041, R.dtype("float16")) _605: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_26_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_26_encoder_attn_v_proj_bias1, alloc607) R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_bias1) gv1042: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape361: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc607, gv1042, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc607) gv1043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape362: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape360, gv1043, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape360) gv1044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape363: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape361, gv1044, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape361) lv62: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv61, R.prim_value(26), reshape362, reshape363, sinfo_args=(R.Object,)) R.vm.kill_object(reshape362) R.vm.kill_object(reshape363) R.vm.kill_object(lv61) model_decoder_layers_27_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1146] gv1045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc608: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1045, R.dtype("float16")) _606: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_27_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc608) R.vm.kill_object(model_decoder_layers_27_encoder_attn_k_proj_weight1) gv1046: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape364: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc608, gv1046, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc608) model_decoder_layers_27_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1147] model_decoder_layers_27_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1148] gv1047: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc609: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1047, R.dtype("float16")) _607: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_27_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_27_encoder_attn_v_proj_bias1, alloc609) R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_bias1) gv1048: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape365: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc609, gv1048, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc609) gv1049: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape366: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape364, gv1049, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape364) gv1050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape367: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape365, gv1050, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape365) lv63: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv62, R.prim_value(27), reshape366, reshape367, sinfo_args=(R.Object,)) R.vm.kill_object(reshape366) R.vm.kill_object(reshape367) R.vm.kill_object(lv62) model_decoder_layers_28_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1170] gv1051: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc610: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1051, R.dtype("float16")) _608: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_28_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc610) R.vm.kill_object(model_decoder_layers_28_encoder_attn_k_proj_weight1) gv1052: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape368: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc610, gv1052, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc610) model_decoder_layers_28_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1171] model_decoder_layers_28_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1172] gv1053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc611: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1053, R.dtype("float16")) _609: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_28_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_28_encoder_attn_v_proj_bias1, alloc611) R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_bias1) gv1054: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape369: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc611, gv1054, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc611) gv1055: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape370: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape368, gv1055, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape368) gv1056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape371: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape369, gv1056, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape369) lv64: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv63, R.prim_value(28), reshape370, reshape371, sinfo_args=(R.Object,)) R.vm.kill_object(reshape370) R.vm.kill_object(reshape371) R.vm.kill_object(lv63) 
model_decoder_layers_29_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1194] gv1057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc612: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1057, R.dtype("float16")) _610: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_29_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc612) R.vm.kill_object(model_decoder_layers_29_encoder_attn_k_proj_weight1) gv1058: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape372: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc612, gv1058, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc612) model_decoder_layers_29_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1195] model_decoder_layers_29_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1196] gv1059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc613: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1059, R.dtype("float16")) _611: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_29_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_29_encoder_attn_v_proj_bias1, alloc613) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_bias1) gv1060: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape373: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc613, gv1060, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc613) gv1061: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape374: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape372, gv1061, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape372) gv1062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape375: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape373, gv1062, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape373) lv65: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv64, R.prim_value(29), reshape374, reshape375, sinfo_args=(R.Object,)) R.vm.kill_object(reshape374) R.vm.kill_object(reshape375) R.vm.kill_object(lv64) model_decoder_layers_30_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1218] gv1063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc614: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1063, R.dtype("float16")) _612: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_30_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc614) R.vm.kill_object(model_decoder_layers_30_encoder_attn_k_proj_weight1) gv1064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape376: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc614, gv1064, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc614) model_decoder_layers_30_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1219] model_decoder_layers_30_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1220] gv1065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc615: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1065, R.dtype("float16")) _613: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_30_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_30_encoder_attn_v_proj_bias1, alloc615) R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_bias1) gv1066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape377: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc615, gv1066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc615) gv1067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape378: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape376, gv1067, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape376) gv1068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape379: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape377, gv1068, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape377) lv66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv65, R.prim_value(30), reshape378, reshape379, sinfo_args=(R.Object,)) R.vm.kill_object(reshape378) R.vm.kill_object(reshape379) R.vm.kill_object(lv65) model_decoder_layers_31_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1242] gv1069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc616: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1069, 
R.dtype("float16")) R.vm.kill_object(storage11) _614: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_31_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc616) R.vm.kill_object(model_decoder_layers_31_encoder_attn_k_proj_weight1) gv1070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape380: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc616, gv1070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc616) model_decoder_layers_31_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1243] model_decoder_layers_31_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1244] gv1071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc617: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1071, R.dtype("float16")) R.vm.kill_object(storage12) _615: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_31_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_31_encoder_attn_v_proj_bias1, alloc617) R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_bias1) gv1072: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape381: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc617, gv1072, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc617) gv1073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape382: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape380, gv1073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape380) gv1074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape383: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape381, gv1074, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape381) gv1: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv66, R.prim_value(31), reshape382, reshape383, sinfo_args=(R.Object,)) R.vm.kill_object(reshape382) R.vm.kill_object(reshape383) R.vm.kill_object(lv66) return gv1 @R.function def batch_decode(input_ids: R.Tensor(("batch_size", 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1, 51866), dtype="float32"): batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "relax.rewrite_cuda_graph.capture_symbolic_vars": ["batch_size"], "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = 
R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight3: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv1075: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape707: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv1075, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) model_decoder_embed_tokens_weight3_1: R.Tensor((51866, 1280), dtype="float16") = 
packed_params[487] storage13: R.Object = R.vm.alloc_storage(R.shape([81920]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1076: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc618: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1076, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight3_1, reshape707, alloc618) R.vm.kill_object(reshape707) R.vm.kill_object(model_decoder_embed_tokens_weight3_1) gv1077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape708: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc618, gv1077, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc618) lv133: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) model_decoder_embed_positions_weight3: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage14: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1078: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc619: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1078, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight3, lv133, alloc619) R.vm.kill_object(lv133) R.vm.kill_object(model_decoder_embed_positions_weight3) gv1079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape709: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc619, gv1079, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc619) storage15: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1080, R.dtype("float16")) cls.add(reshape708, reshape709, alloc620) R.vm.kill_object(reshape708) R.vm.kill_object(reshape709) model_decoder_layers_0_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[497] gv1081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1081, R.dtype("float16")) cls.layer_norm(alloc620, model_decoder_layers_0_self_attn_layer_norm_weight3, model_decoder_layers_0_self_attn_layer_norm_bias3, alloc621) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias3) model_decoder_layers_0_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[493] gv1082: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1082, R.dtype("float16")) _620: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_q_proj_weight3, alloc621, model_decoder_layers_0_self_attn_q_proj_bias3, alloc622) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias3) gv1083: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape710: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc622, gv1083, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc622) model_decoder_layers_0_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage16: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1084, R.dtype("float16")) _621: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_0_self_attn_k_proj_weight3, alloc621, alloc623) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight3) gv1085: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape711: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc623, gv1085, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc623) model_decoder_layers_0_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[491] storage17: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1086, R.dtype("float16")) _622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_v_proj_weight3, alloc621, model_decoder_layers_0_self_attn_v_proj_bias3, alloc624) R.vm.kill_object(alloc621) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias3) gv1087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape712: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc624, gv1087, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc624) gv1088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc625: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1088, R.dtype("float16")) cls.concatenate(reshape710, reshape711, reshape712, alloc625) R.vm.kill_object(reshape710) R.vm.kill_object(reshape711) R.vm.kill_object(reshape712) gv1089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape713: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc625, gv1089, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc625) gv1090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1090, R.dtype("float16")) _624: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape713, alloc626) R.vm.kill_object(reshape713) gv1091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape714: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc626, gv1091, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc626) gv1092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape715: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape714, gv1092, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape714) model_decoder_layers_0_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[495] gv1093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1093, R.dtype("float16")) _625: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_out_proj_weight3, reshape715, model_decoder_layers_0_self_attn_out_proj_bias3, alloc627) R.vm.kill_object(reshape715) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias3) gv1094: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1094, R.dtype("float16")) cls.add(alloc620, alloc627, alloc628) R.vm.kill_object(alloc620) R.vm.kill_object(alloc627) model_decoder_layers_0_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[506] gv1095: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1095, R.dtype("float16")) cls.layer_norm(alloc628, model_decoder_layers_0_encoder_attn_layer_norm_weight3, model_decoder_layers_0_encoder_attn_layer_norm_bias3, alloc629) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias3) model_decoder_layers_0_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[502] gv1096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1096, R.dtype("float16")) _628: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight3, alloc629, model_decoder_layers_0_encoder_attn_q_proj_bias3, alloc630) R.vm.kill_object(alloc629) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias3) gv1097: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape716: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc630, gv1097, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc630) gv1098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape717: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape716, gv1098, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape716) gv1099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1099, R.dtype("float16")) _629: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape717, alloc631) R.vm.kill_object(reshape717) gv1100: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape718: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc631, gv1100, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc631) gv1101: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape719: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape718, gv1101, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape718) model_decoder_layers_0_encoder_attn_out_proj_weight3: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[504] gv1102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1102, R.dtype("float16")) _630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight3, reshape719, model_decoder_layers_0_encoder_attn_out_proj_bias3, alloc632) R.vm.kill_object(reshape719) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias3) gv1103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1103, R.dtype("float16")) cls.add(alloc628, alloc632, alloc633) R.vm.kill_object(alloc628) R.vm.kill_object(alloc632) model_decoder_layers_0_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[512] gv1104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1104, R.dtype("float16")) cls.layer_norm(alloc633, model_decoder_layers_0_final_layer_norm_weight3, 
model_decoder_layers_0_final_layer_norm_bias3, alloc634) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias3) model_decoder_layers_0_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[508] gv1105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1105, R.dtype("float16")) _633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_0_fc1_weight3, alloc634, model_decoder_layers_0_fc1_bias3, alloc635) R.vm.kill_object(alloc634) R.vm.kill_object(model_decoder_layers_0_fc1_weight3) R.vm.kill_object(model_decoder_layers_0_fc1_bias3) model_decoder_layers_0_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[510] gv1106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1106, R.dtype("float16")) _634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_0_fc2_weight3, alloc635, model_decoder_layers_0_fc2_bias3, alloc636) R.vm.kill_object(alloc635) R.vm.kill_object(model_decoder_layers_0_fc2_weight3) R.vm.kill_object(model_decoder_layers_0_fc2_bias3) gv1107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1107, R.dtype("float16")) cls.add(alloc633, alloc636, alloc637) R.vm.kill_object(alloc633) R.vm.kill_object(alloc636) model_decoder_layers_1_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[521] gv1108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1108, R.dtype("float16")) cls.layer_norm(alloc637, model_decoder_layers_1_self_attn_layer_norm_weight3, model_decoder_layers_1_self_attn_layer_norm_bias3, alloc638) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias3) model_decoder_layers_1_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[517] gv1109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1109, R.dtype("float16")) _637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_q_proj_weight3, alloc638, model_decoder_layers_1_self_attn_q_proj_bias3, alloc639) 
R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias3) gv1110: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape720: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc639, gv1110, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc639) model_decoder_layers_1_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv1111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc640: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1111, R.dtype("float16")) _638: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_1_self_attn_k_proj_weight3, alloc638, alloc640) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight3) gv1112: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape721: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc640, gv1112, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc640) model_decoder_layers_1_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[515] gv1113: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1113, R.dtype("float16")) _639: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_v_proj_weight3, alloc638, model_decoder_layers_1_self_attn_v_proj_bias3, alloc641) R.vm.kill_object(alloc638) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias3) gv1114: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape722: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc641, gv1114, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc641) gv1115: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc642: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1115, R.dtype("float16")) cls.concatenate(reshape720, reshape721, reshape722, alloc642) R.vm.kill_object(reshape720) R.vm.kill_object(reshape721) R.vm.kill_object(reshape722) gv1116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape723: R.Tensor((batch_size, 60, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc642, gv1116, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc642) gv1117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1117, R.dtype("float16")) _641: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape723, alloc643) R.vm.kill_object(reshape723) gv1118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape724: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc643, gv1118, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc643) gv1119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape725: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape724, gv1119, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape724) model_decoder_layers_1_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[519] gv1120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1120, R.dtype("float16")) _642: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_out_proj_weight3, reshape725, model_decoder_layers_1_self_attn_out_proj_bias3, alloc644) R.vm.kill_object(reshape725) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias3) gv1121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1121, R.dtype("float16")) cls.add(alloc637, alloc644, alloc645) R.vm.kill_object(alloc637) R.vm.kill_object(alloc644) model_decoder_layers_1_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[530] gv1122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1122, R.dtype("float16")) cls.layer_norm(alloc645, model_decoder_layers_1_encoder_attn_layer_norm_weight3, model_decoder_layers_1_encoder_attn_layer_norm_bias3, alloc646) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias3) model_decoder_layers_1_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] 
model_decoder_layers_1_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[526] gv1123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1123, R.dtype("float16")) _645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight3, alloc646, model_decoder_layers_1_encoder_attn_q_proj_bias3, alloc647) R.vm.kill_object(alloc646) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias3) gv1124: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape726: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc647, gv1124, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc647) gv1125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape727: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape726, gv1125, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape726) gv1126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc648: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1126, R.dtype("float16")) _646: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape727, alloc648) R.vm.kill_object(reshape727) gv1127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape728: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc648, gv1127, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc648) gv1128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape729: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape728, gv1128, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape728) model_decoder_layers_1_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[528] gv1129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1129, R.dtype("float16")) _647: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight3, reshape729, model_decoder_layers_1_encoder_attn_out_proj_bias3, 
alloc649) R.vm.kill_object(reshape729) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias3) gv1130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1130, R.dtype("float16")) cls.add(alloc645, alloc649, alloc650) R.vm.kill_object(alloc645) R.vm.kill_object(alloc649) model_decoder_layers_1_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[536] gv1131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1131, R.dtype("float16")) cls.layer_norm(alloc650, model_decoder_layers_1_final_layer_norm_weight3, model_decoder_layers_1_final_layer_norm_bias3, alloc651) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias3) model_decoder_layers_1_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[532] gv1132: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1132, R.dtype("float16")) _650: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_1_fc1_weight3, alloc651, model_decoder_layers_1_fc1_bias3, alloc652) R.vm.kill_object(alloc651) R.vm.kill_object(model_decoder_layers_1_fc1_weight3) R.vm.kill_object(model_decoder_layers_1_fc1_bias3) model_decoder_layers_1_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[534] gv1133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc653: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1133, R.dtype("float16")) _651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_1_fc2_weight3, alloc652, model_decoder_layers_1_fc2_bias3, alloc653) R.vm.kill_object(alloc652) R.vm.kill_object(model_decoder_layers_1_fc2_weight3) R.vm.kill_object(model_decoder_layers_1_fc2_bias3) gv1134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1134, R.dtype("float16")) cls.add(alloc650, alloc653, alloc654) R.vm.kill_object(alloc650) R.vm.kill_object(alloc653) model_decoder_layers_2_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[545] gv1135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1135, R.dtype("float16")) cls.layer_norm(alloc654, model_decoder_layers_2_self_attn_layer_norm_weight3, model_decoder_layers_2_self_attn_layer_norm_bias3, alloc655) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias3) model_decoder_layers_2_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[541] gv1136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1136, R.dtype("float16")) _654: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_q_proj_weight3, alloc655, model_decoder_layers_2_self_attn_q_proj_bias3, alloc656) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias3) gv1137: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape730: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc656, gv1137, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc656) model_decoder_layers_2_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv1138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1138, R.dtype("float16")) _655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_2_self_attn_k_proj_weight3, alloc655, alloc657) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight3) gv1139: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape731: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc657, gv1139, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc657) model_decoder_layers_2_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[539] gv1140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc658: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1140, R.dtype("float16")) _656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_v_proj_weight3, alloc655, model_decoder_layers_2_self_attn_v_proj_bias3, alloc658) R.vm.kill_object(alloc655) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias3) gv1141: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape732: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc658, gv1141, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc658) gv1142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc659: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1142, R.dtype("float16")) cls.concatenate(reshape730, reshape731, reshape732, alloc659) R.vm.kill_object(reshape730) R.vm.kill_object(reshape731) R.vm.kill_object(reshape732) gv1143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape733: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc659, gv1143, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc659) gv1144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc660: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1144, R.dtype("float16")) _658: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape733, alloc660) R.vm.kill_object(reshape733) gv1145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape734: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc660, gv1145, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc660) gv1146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape735: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape734, gv1146, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape734) model_decoder_layers_2_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[543] gv1147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1147, R.dtype("float16")) _659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_out_proj_weight3, reshape735, model_decoder_layers_2_self_attn_out_proj_bias3, alloc661) R.vm.kill_object(reshape735) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias3) gv1148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1148, R.dtype("float16")) cls.add(alloc654, alloc661, alloc662) R.vm.kill_object(alloc654) R.vm.kill_object(alloc661) model_decoder_layers_2_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[554] gv1149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1149, R.dtype("float16")) cls.layer_norm(alloc662, model_decoder_layers_2_encoder_attn_layer_norm_weight3, model_decoder_layers_2_encoder_attn_layer_norm_bias3, alloc663) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias3) model_decoder_layers_2_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[550] gv1150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1150, R.dtype("float16")) _662: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight3, alloc663, model_decoder_layers_2_encoder_attn_q_proj_bias3, alloc664) R.vm.kill_object(alloc663) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias3) gv1151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape736: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc664, gv1151, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc664) gv1152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape737: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape736, gv1152, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape736) gv1153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1153, R.dtype("float16")) _663: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape737, alloc665) R.vm.kill_object(reshape737) gv1154: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape738: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc665, gv1154, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc665) 
gv1155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape739: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape738, gv1155, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape738) model_decoder_layers_2_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[552] gv1156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1156, R.dtype("float16")) _664: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight3, reshape739, model_decoder_layers_2_encoder_attn_out_proj_bias3, alloc666) R.vm.kill_object(reshape739) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias3) gv1157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1157, R.dtype("float16")) cls.add(alloc662, alloc666, alloc667) R.vm.kill_object(alloc662) R.vm.kill_object(alloc666) model_decoder_layers_2_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias3: 
R.Tensor((1280,), dtype="float16") = packed_params[560] gv1158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1158, R.dtype("float16")) cls.layer_norm(alloc667, model_decoder_layers_2_final_layer_norm_weight3, model_decoder_layers_2_final_layer_norm_bias3, alloc668) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias3) model_decoder_layers_2_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[556] gv1159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1159, R.dtype("float16")) _667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_2_fc1_weight3, alloc668, model_decoder_layers_2_fc1_bias3, alloc669) R.vm.kill_object(alloc668) R.vm.kill_object(model_decoder_layers_2_fc1_weight3) R.vm.kill_object(model_decoder_layers_2_fc1_bias3) model_decoder_layers_2_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[558] gv1160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc670: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1160, R.dtype("float16")) _668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_2_fc2_weight3, alloc669, model_decoder_layers_2_fc2_bias3, alloc670) R.vm.kill_object(alloc669) R.vm.kill_object(model_decoder_layers_2_fc2_weight3) R.vm.kill_object(model_decoder_layers_2_fc2_bias3) gv1161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1161, R.dtype("float16")) cls.add(alloc667, alloc670, alloc671) R.vm.kill_object(alloc667) R.vm.kill_object(alloc670) model_decoder_layers_3_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[569] gv1162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1162, R.dtype("float16")) cls.layer_norm(alloc671, model_decoder_layers_3_self_attn_layer_norm_weight3, model_decoder_layers_3_self_attn_layer_norm_bias3, alloc672) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias3) model_decoder_layers_3_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[565] gv1163: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1163, R.dtype("float16")) _671: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_q_proj_weight3, alloc672, model_decoder_layers_3_self_attn_q_proj_bias3, alloc673) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias3) gv1164: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape740: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc673, gv1164, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc673) model_decoder_layers_3_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv1165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1165, R.dtype("float16")) _672: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_3_self_attn_k_proj_weight3, alloc672, alloc674) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight3) gv1166: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape741: 
R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc674, gv1166, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc674) model_decoder_layers_3_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[563] gv1167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1167, R.dtype("float16")) _673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_v_proj_weight3, alloc672, model_decoder_layers_3_self_attn_v_proj_bias3, alloc675) R.vm.kill_object(alloc672) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias3) gv1168: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape742: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc675, gv1168, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc675) gv1169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc676: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1169, R.dtype("float16")) 
cls.concatenate(reshape740, reshape741, reshape742, alloc676) R.vm.kill_object(reshape740) R.vm.kill_object(reshape741) R.vm.kill_object(reshape742) gv1170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape743: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc676, gv1170, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc676) gv1171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1171, R.dtype("float16")) _675: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape743, alloc677) R.vm.kill_object(reshape743) gv1172: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape744: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc677, gv1172, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc677) gv1173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape745: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape744, gv1173, sinfo_args=(R.Tensor((batch_size, 1, 
1280), dtype="float16"),)) R.vm.kill_object(reshape744) model_decoder_layers_3_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[567] gv1174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1174, R.dtype("float16")) _676: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_out_proj_weight3, reshape745, model_decoder_layers_3_self_attn_out_proj_bias3, alloc678) R.vm.kill_object(reshape745) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias3) gv1175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1175, R.dtype("float16")) cls.add(alloc671, alloc678, alloc679) R.vm.kill_object(alloc671) R.vm.kill_object(alloc678) model_decoder_layers_3_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[578] gv1176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1176, 
R.dtype("float16")) cls.layer_norm(alloc679, model_decoder_layers_3_encoder_attn_layer_norm_weight3, model_decoder_layers_3_encoder_attn_layer_norm_bias3, alloc680) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias3) model_decoder_layers_3_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[574] gv1177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1177, R.dtype("float16")) _679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight3, alloc680, model_decoder_layers_3_encoder_attn_q_proj_bias3, alloc681) R.vm.kill_object(alloc680) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias3) gv1178: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape746: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc681, gv1178, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc681) gv1179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape747: R.Tensor((batch_size, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape746, gv1179, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape746) gv1180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1180, R.dtype("float16")) _680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape747, alloc682) R.vm.kill_object(reshape747) gv1181: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape748: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc682, gv1181, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc682) gv1182: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape749: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape748, gv1182, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape748) model_decoder_layers_3_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[576] gv1183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1183, R.dtype("float16")) _681: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight3, reshape749, model_decoder_layers_3_encoder_attn_out_proj_bias3, alloc683) R.vm.kill_object(reshape749) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias3) gv1184: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1184, R.dtype("float16")) cls.add(alloc679, alloc683, alloc684) R.vm.kill_object(alloc679) R.vm.kill_object(alloc683) model_decoder_layers_3_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[584] gv1185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc685: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1185, R.dtype("float16")) cls.layer_norm(alloc684, model_decoder_layers_3_final_layer_norm_weight3, model_decoder_layers_3_final_layer_norm_bias3, alloc685) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias3) model_decoder_layers_3_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias3: 
R.Tensor((5120,), dtype="float16") = packed_params[580] gv1186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1186, R.dtype("float16")) _684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_3_fc1_weight3, alloc685, model_decoder_layers_3_fc1_bias3, alloc686) R.vm.kill_object(alloc685) R.vm.kill_object(model_decoder_layers_3_fc1_weight3) R.vm.kill_object(model_decoder_layers_3_fc1_bias3) model_decoder_layers_3_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[582] gv1187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1187, R.dtype("float16")) _685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_3_fc2_weight3, alloc686, model_decoder_layers_3_fc2_bias3, alloc687) R.vm.kill_object(alloc686) R.vm.kill_object(model_decoder_layers_3_fc2_weight3) R.vm.kill_object(model_decoder_layers_3_fc2_bias3) gv1188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1188, R.dtype("float16")) cls.add(alloc684, alloc687, alloc688) R.vm.kill_object(alloc684) 
R.vm.kill_object(alloc687) model_decoder_layers_4_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[593] gv1189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1189, R.dtype("float16")) cls.layer_norm(alloc688, model_decoder_layers_4_self_attn_layer_norm_weight3, model_decoder_layers_4_self_attn_layer_norm_bias3, alloc689) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias3) model_decoder_layers_4_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[589] gv1190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1190, R.dtype("float16")) _688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_q_proj_weight3, alloc689, model_decoder_layers_4_self_attn_q_proj_bias3, alloc690) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias3) gv1191: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape750: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc690, gv1191, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc690) model_decoder_layers_4_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv1192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1192, R.dtype("float16")) _689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_4_self_attn_k_proj_weight3, alloc689, alloc691) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight3) gv1193: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape751: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc691, gv1193, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc691) model_decoder_layers_4_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[587] gv1194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1194, R.dtype("float16")) _690: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_v_proj_weight3, alloc689, model_decoder_layers_4_self_attn_v_proj_bias3, alloc692) R.vm.kill_object(alloc689) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias3) gv1195: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape752: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc692, gv1195, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc692) gv1196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc693: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1196, R.dtype("float16")) cls.concatenate(reshape750, reshape751, reshape752, alloc693) R.vm.kill_object(reshape750) R.vm.kill_object(reshape751) R.vm.kill_object(reshape752) gv1197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape753: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc693, gv1197, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc693) gv1198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc694: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1198, R.dtype("float16")) _692: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape753, alloc694) R.vm.kill_object(reshape753) gv1199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape754: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc694, gv1199, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc694) gv1200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape755: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape754, gv1200, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape754) model_decoder_layers_4_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[591] gv1201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1201, R.dtype("float16")) _693: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_out_proj_weight3, reshape755, 
model_decoder_layers_4_self_attn_out_proj_bias3, alloc695) R.vm.kill_object(reshape755) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias3) gv1202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1202, R.dtype("float16")) cls.add(alloc688, alloc695, alloc696) R.vm.kill_object(alloc688) R.vm.kill_object(alloc695) model_decoder_layers_4_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[602] gv1203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1203, R.dtype("float16")) cls.layer_norm(alloc696, model_decoder_layers_4_encoder_attn_layer_norm_weight3, model_decoder_layers_4_encoder_attn_layer_norm_bias3, alloc697) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias3) model_decoder_layers_4_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[598] gv1204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc698: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1204, R.dtype("float16")) _696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight3, alloc697, model_decoder_layers_4_encoder_attn_q_proj_bias3, alloc698) R.vm.kill_object(alloc697) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias3) gv1205: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape756: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc698, gv1205, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc698) gv1206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape757: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape756, gv1206, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape756) gv1207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc699: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1207, R.dtype("float16")) _697: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape757, alloc699) R.vm.kill_object(reshape757) gv1208: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape758: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc699, gv1208, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc699) gv1209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape759: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape758, gv1209, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape758) model_decoder_layers_4_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[600] gv1210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1210, R.dtype("float16")) _698: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight3, reshape759, model_decoder_layers_4_encoder_attn_out_proj_bias3, alloc700) R.vm.kill_object(reshape759) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias3) gv1211: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1211, R.dtype("float16")) cls.add(alloc696, alloc700, alloc701) R.vm.kill_object(alloc696) R.vm.kill_object(alloc700) model_decoder_layers_4_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[608] gv1212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1212, R.dtype("float16")) cls.layer_norm(alloc701, model_decoder_layers_4_final_layer_norm_weight3, model_decoder_layers_4_final_layer_norm_bias3, alloc702) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias3) model_decoder_layers_4_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[604] gv1213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1213, R.dtype("float16")) _701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_4_fc1_weight3, alloc702, model_decoder_layers_4_fc1_bias3, alloc703) R.vm.kill_object(alloc702) R.vm.kill_object(model_decoder_layers_4_fc1_weight3) R.vm.kill_object(model_decoder_layers_4_fc1_bias3) 
model_decoder_layers_4_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[606] gv1214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1214, R.dtype("float16")) _702: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_4_fc2_weight3, alloc703, model_decoder_layers_4_fc2_bias3, alloc704) R.vm.kill_object(alloc703) R.vm.kill_object(model_decoder_layers_4_fc2_weight3) R.vm.kill_object(model_decoder_layers_4_fc2_bias3) gv1215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1215, R.dtype("float16")) cls.add(alloc701, alloc704, alloc705) R.vm.kill_object(alloc701) R.vm.kill_object(alloc704) model_decoder_layers_5_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[617] gv1216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc706: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1216, R.dtype("float16")) cls.layer_norm(alloc705, model_decoder_layers_5_self_attn_layer_norm_weight3, model_decoder_layers_5_self_attn_layer_norm_bias3, alloc706) 
R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias3) model_decoder_layers_5_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[613] gv1217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1217, R.dtype("float16")) _705: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_q_proj_weight3, alloc706, model_decoder_layers_5_self_attn_q_proj_bias3, alloc707) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias3) gv1218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape760: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc707, gv1218, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc707) model_decoder_layers_5_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv1219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1219, R.dtype("float16")) _706: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_5_self_attn_k_proj_weight3, alloc706, alloc708) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight3) gv1220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape761: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc708, gv1220, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc708) model_decoder_layers_5_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[611] gv1221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1221, R.dtype("float16")) _707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_v_proj_weight3, alloc706, model_decoder_layers_5_self_attn_v_proj_bias3, alloc709) R.vm.kill_object(alloc706) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias3) gv1222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape762: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc709, gv1222, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc709) gv1223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc710: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1223, R.dtype("float16")) cls.concatenate(reshape760, reshape761, reshape762, alloc710) R.vm.kill_object(reshape760) R.vm.kill_object(reshape761) R.vm.kill_object(reshape762) gv1224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape763: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc710, gv1224, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc710) gv1225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1225, R.dtype("float16")) _709: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape763, alloc711) R.vm.kill_object(reshape763) gv1226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape764: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc711, gv1226, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc711) gv1227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape765: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape764, gv1227, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape764) model_decoder_layers_5_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[615] gv1228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1228, R.dtype("float16")) _710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_out_proj_weight3, reshape765, model_decoder_layers_5_self_attn_out_proj_bias3, alloc712) R.vm.kill_object(reshape765) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias3) gv1229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc713: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1229, R.dtype("float16")) cls.add(alloc705, alloc712, alloc713) R.vm.kill_object(alloc705) R.vm.kill_object(alloc712) model_decoder_layers_5_encoder_attn_layer_norm_weight3: R.Tensor((1280,), 
dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[626] gv1230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1230, R.dtype("float16")) cls.layer_norm(alloc713, model_decoder_layers_5_encoder_attn_layer_norm_weight3, model_decoder_layers_5_encoder_attn_layer_norm_bias3, alloc714) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias3) model_decoder_layers_5_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[622] gv1231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1231, R.dtype("float16")) _713: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight3, alloc714, model_decoder_layers_5_encoder_attn_q_proj_bias3, alloc715) R.vm.kill_object(alloc714) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias3) gv1232: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape766: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc715, gv1232, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc715) gv1233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape767: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape766, gv1233, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape766) gv1234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1234, R.dtype("float16")) _714: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape767, alloc716) R.vm.kill_object(reshape767) gv1235: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape768: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc716, gv1235, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc716) gv1236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape769: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape768, gv1236, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape768) model_decoder_layers_5_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[624] gv1237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1237, R.dtype("float16")) _715: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight3, reshape769, model_decoder_layers_5_encoder_attn_out_proj_bias3, alloc717) R.vm.kill_object(reshape769) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias3) gv1238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1238, R.dtype("float16")) cls.add(alloc713, alloc717, alloc718) R.vm.kill_object(alloc713) R.vm.kill_object(alloc717) model_decoder_layers_5_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[632] gv1239: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc719: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1239, R.dtype("float16")) cls.layer_norm(alloc718, model_decoder_layers_5_final_layer_norm_weight3, model_decoder_layers_5_final_layer_norm_bias3, alloc719) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias3) model_decoder_layers_5_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[628] gv1240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1240, R.dtype("float16")) _718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_5_fc1_weight3, alloc719, model_decoder_layers_5_fc1_bias3, alloc720) R.vm.kill_object(alloc719) R.vm.kill_object(model_decoder_layers_5_fc1_weight3) R.vm.kill_object(model_decoder_layers_5_fc1_bias3) model_decoder_layers_5_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[630] gv1241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1241, R.dtype("float16")) _719: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_5_fc2_weight3, alloc720, model_decoder_layers_5_fc2_bias3, alloc721) R.vm.kill_object(alloc720) R.vm.kill_object(model_decoder_layers_5_fc2_weight3) 
R.vm.kill_object(model_decoder_layers_5_fc2_bias3) gv1242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1242, R.dtype("float16")) cls.add(alloc718, alloc721, alloc722) R.vm.kill_object(alloc718) R.vm.kill_object(alloc721) model_decoder_layers_6_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[641] gv1243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1243, R.dtype("float16")) cls.layer_norm(alloc722, model_decoder_layers_6_self_attn_layer_norm_weight3, model_decoder_layers_6_self_attn_layer_norm_bias3, alloc723) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias3) model_decoder_layers_6_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[637] gv1244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1244, R.dtype("float16")) _722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_6_self_attn_q_proj_weight3, alloc723, model_decoder_layers_6_self_attn_q_proj_bias3, alloc724) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias3) gv1245: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape770: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc724, gv1245, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc724) model_decoder_layers_6_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv1246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1246, R.dtype("float16")) _723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_6_self_attn_k_proj_weight3, alloc723, alloc725) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight3) gv1247: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape771: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc725, gv1247, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc725) model_decoder_layers_6_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] 
model_decoder_layers_6_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[635] gv1248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1248, R.dtype("float16")) _724: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_v_proj_weight3, alloc723, model_decoder_layers_6_self_attn_v_proj_bias3, alloc726) R.vm.kill_object(alloc723) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias3) gv1249: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape772: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc726, gv1249, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc726) gv1250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc727: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1250, R.dtype("float16")) cls.concatenate(reshape770, reshape771, reshape772, alloc727) R.vm.kill_object(reshape770) R.vm.kill_object(reshape771) R.vm.kill_object(reshape772) gv1251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape773: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc727, gv1251, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc727) gv1252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1252, R.dtype("float16")) _726: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape773, alloc728) R.vm.kill_object(reshape773) gv1253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape774: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc728, gv1253, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc728) gv1254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape775: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape774, gv1254, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape774) model_decoder_layers_6_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[639] gv1255: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1255, R.dtype("float16")) _727: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_out_proj_weight3, reshape775, model_decoder_layers_6_self_attn_out_proj_bias3, alloc729) R.vm.kill_object(reshape775) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias3) gv1256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1256, R.dtype("float16")) cls.add(alloc722, alloc729, alloc730) R.vm.kill_object(alloc722) R.vm.kill_object(alloc729) model_decoder_layers_6_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[650] gv1257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1257, R.dtype("float16")) cls.layer_norm(alloc730, model_decoder_layers_6_encoder_attn_layer_norm_weight3, model_decoder_layers_6_encoder_attn_layer_norm_bias3, alloc731) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias3) model_decoder_layers_6_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[646] gv1258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1258, R.dtype("float16")) _730: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight3, alloc731, model_decoder_layers_6_encoder_attn_q_proj_bias3, alloc732) R.vm.kill_object(alloc731) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias3) gv1259: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape776: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc732, gv1259, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc732) gv1260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape777: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape776, gv1260, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape776) gv1261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1261, R.dtype("float16")) _731: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape777, alloc733) R.vm.kill_object(reshape777) gv1262: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape778: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc733, gv1262, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc733) gv1263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape779: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape778, gv1263, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape778) model_decoder_layers_6_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[648] gv1264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1264, R.dtype("float16")) _732: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight3, reshape779, model_decoder_layers_6_encoder_attn_out_proj_bias3, alloc734) R.vm.kill_object(reshape779) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias3) gv1265: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1265, R.dtype("float16")) cls.add(alloc730, alloc734, alloc735) R.vm.kill_object(alloc730) R.vm.kill_object(alloc734) model_decoder_layers_6_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[656] gv1266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc736: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1266, R.dtype("float16")) cls.layer_norm(alloc735, model_decoder_layers_6_final_layer_norm_weight3, model_decoder_layers_6_final_layer_norm_bias3, alloc736) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias3) model_decoder_layers_6_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[652] gv1267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1267, R.dtype("float16")) _735: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_6_fc1_weight3, alloc736, model_decoder_layers_6_fc1_bias3, alloc737) R.vm.kill_object(alloc736) R.vm.kill_object(model_decoder_layers_6_fc1_weight3) R.vm.kill_object(model_decoder_layers_6_fc1_bias3) model_decoder_layers_6_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[654] gv1268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1268, R.dtype("float16")) _736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_6_fc2_weight3, alloc737, model_decoder_layers_6_fc2_bias3, alloc738) R.vm.kill_object(alloc737) R.vm.kill_object(model_decoder_layers_6_fc2_weight3) R.vm.kill_object(model_decoder_layers_6_fc2_bias3) gv1269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1269, R.dtype("float16")) cls.add(alloc735, alloc738, alloc739) R.vm.kill_object(alloc735) R.vm.kill_object(alloc738) model_decoder_layers_7_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[665] gv1270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1270, R.dtype("float16")) cls.layer_norm(alloc739, model_decoder_layers_7_self_attn_layer_norm_weight3, model_decoder_layers_7_self_attn_layer_norm_bias3, alloc740) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias3) model_decoder_layers_7_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[661] gv1271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1271, R.dtype("float16")) _739: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_q_proj_weight3, alloc740, model_decoder_layers_7_self_attn_q_proj_bias3, alloc741) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias3) gv1272: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape780: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc741, gv1272, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc741) model_decoder_layers_7_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv1273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1273, R.dtype("float16")) _740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_7_self_attn_k_proj_weight3, alloc740, alloc742) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight3) gv1274: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape781: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc742, gv1274, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc742) model_decoder_layers_7_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[659] gv1275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1275, R.dtype("float16")) _741: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_v_proj_weight3, alloc740, model_decoder_layers_7_self_attn_v_proj_bias3, alloc743) R.vm.kill_object(alloc740) 
R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias3) gv1276: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape782: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc743, gv1276, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc743) gv1277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc744: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1277, R.dtype("float16")) cls.concatenate(reshape780, reshape781, reshape782, alloc744) R.vm.kill_object(reshape780) R.vm.kill_object(reshape781) R.vm.kill_object(reshape782) gv1278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape783: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc744, gv1278, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc744) gv1279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1279, R.dtype("float16")) _743: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape783, alloc745) R.vm.kill_object(reshape783) gv1280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape784: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc745, gv1280, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc745) gv1281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape785: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape784, gv1281, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape784) model_decoder_layers_7_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[663] gv1282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1282, R.dtype("float16")) _744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_out_proj_weight3, reshape785, model_decoder_layers_7_self_attn_out_proj_bias3, alloc746) R.vm.kill_object(reshape785) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias3) gv1283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1283, R.dtype("float16")) cls.add(alloc739, alloc746, alloc747) R.vm.kill_object(alloc739) R.vm.kill_object(alloc746) model_decoder_layers_7_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[674] gv1284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1284, R.dtype("float16")) cls.layer_norm(alloc747, model_decoder_layers_7_encoder_attn_layer_norm_weight3, model_decoder_layers_7_encoder_attn_layer_norm_bias3, alloc748) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias3) model_decoder_layers_7_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[670] gv1285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1285, R.dtype("float16")) _747: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight3, alloc748, model_decoder_layers_7_encoder_attn_q_proj_bias3, alloc749) R.vm.kill_object(alloc748) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias3) gv1286: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape786: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc749, gv1286, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc749) gv1287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape787: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape786, gv1287, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape786) gv1288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc750: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1288, R.dtype("float16")) _748: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape787, alloc750) R.vm.kill_object(reshape787) gv1289: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape788: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc750, gv1289, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc750) gv1290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape789: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape788, gv1290, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape788) model_decoder_layers_7_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[672] gv1291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1291, R.dtype("float16")) _749: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight3, reshape789, model_decoder_layers_7_encoder_attn_out_proj_bias3, alloc751) R.vm.kill_object(reshape789) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias3) gv1292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc752: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage15, R.prim_value(0), gv1292, R.dtype("float16")) cls.add(alloc747, alloc751, alloc752) R.vm.kill_object(alloc747) R.vm.kill_object(alloc751) model_decoder_layers_7_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[680] gv1293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc753: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1293, R.dtype("float16")) cls.layer_norm(alloc752, model_decoder_layers_7_final_layer_norm_weight3, model_decoder_layers_7_final_layer_norm_bias3, alloc753) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias3) model_decoder_layers_7_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[676] gv1294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1294, R.dtype("float16")) _752: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_7_fc1_weight3, alloc753, model_decoder_layers_7_fc1_bias3, alloc754) R.vm.kill_object(alloc753) R.vm.kill_object(model_decoder_layers_7_fc1_weight3) R.vm.kill_object(model_decoder_layers_7_fc1_bias3) model_decoder_layers_7_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[678] gv1295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1295, R.dtype("float16")) _753: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_7_fc2_weight3, alloc754, model_decoder_layers_7_fc2_bias3, alloc755) R.vm.kill_object(alloc754) R.vm.kill_object(model_decoder_layers_7_fc2_weight3) R.vm.kill_object(model_decoder_layers_7_fc2_bias3) gv1296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1296, R.dtype("float16")) cls.add(alloc752, alloc755, alloc756) R.vm.kill_object(alloc752) R.vm.kill_object(alloc755) model_decoder_layers_8_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[689] gv1297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1297, R.dtype("float16")) cls.layer_norm(alloc756, model_decoder_layers_8_self_attn_layer_norm_weight3, model_decoder_layers_8_self_attn_layer_norm_bias3, alloc757) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias3) 
model_decoder_layers_8_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[685] gv1298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1298, R.dtype("float16")) _756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_q_proj_weight3, alloc757, model_decoder_layers_8_self_attn_q_proj_bias3, alloc758) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias3) gv1299: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape790: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc758, gv1299, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc758) model_decoder_layers_8_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv1300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1300, R.dtype("float16")) _757: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_8_self_attn_k_proj_weight3, alloc757, alloc759) 
R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight3) gv1301: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape791: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc759, gv1301, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc759) model_decoder_layers_8_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[683] gv1302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc760: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1302, R.dtype("float16")) _758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_v_proj_weight3, alloc757, model_decoder_layers_8_self_attn_v_proj_bias3, alloc760) R.vm.kill_object(alloc757) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias3) gv1303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape792: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc760, gv1303, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc760) gv1304: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc761: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1304, R.dtype("float16")) cls.concatenate(reshape790, reshape791, reshape792, alloc761) R.vm.kill_object(reshape790) R.vm.kill_object(reshape791) R.vm.kill_object(reshape792) gv1305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape793: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc761, gv1305, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc761) gv1306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1306, R.dtype("float16")) _760: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape793, alloc762) R.vm.kill_object(reshape793) gv1307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape794: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc762, gv1307, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc762) gv1308: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape795: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape794, gv1308, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape794) model_decoder_layers_8_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[687] gv1309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1309, R.dtype("float16")) _761: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_out_proj_weight3, reshape795, model_decoder_layers_8_self_attn_out_proj_bias3, alloc763) R.vm.kill_object(reshape795) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias3) gv1310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1310, R.dtype("float16")) cls.add(alloc756, alloc763, alloc764) R.vm.kill_object(alloc756) R.vm.kill_object(alloc763) model_decoder_layers_8_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[698] gv1311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1311, R.dtype("float16")) cls.layer_norm(alloc764, model_decoder_layers_8_encoder_attn_layer_norm_weight3, model_decoder_layers_8_encoder_attn_layer_norm_bias3, alloc765) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias3) model_decoder_layers_8_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[694] gv1312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1312, R.dtype("float16")) _764: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight3, alloc765, model_decoder_layers_8_encoder_attn_q_proj_bias3, alloc766) R.vm.kill_object(alloc765) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias3) gv1313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape796: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc766, 
gv1313, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc766) gv1314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape797: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape796, gv1314, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape796) gv1315: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1315, R.dtype("float16")) _765: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape797, alloc767) R.vm.kill_object(reshape797) gv1316: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape798: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc767, gv1316, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc767) gv1317: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape799: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape798, gv1317, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape798) 
model_decoder_layers_8_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[696] gv1318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc768: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1318, R.dtype("float16")) _766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight3, reshape799, model_decoder_layers_8_encoder_attn_out_proj_bias3, alloc768) R.vm.kill_object(reshape799) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias3) gv1319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1319, R.dtype("float16")) cls.add(alloc764, alloc768, alloc769) R.vm.kill_object(alloc764) R.vm.kill_object(alloc768) model_decoder_layers_8_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[704] gv1320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1320, R.dtype("float16")) cls.layer_norm(alloc769, 
model_decoder_layers_8_final_layer_norm_weight3, model_decoder_layers_8_final_layer_norm_bias3, alloc770) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias3) model_decoder_layers_8_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[700] gv1321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1321, R.dtype("float16")) _769: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_8_fc1_weight3, alloc770, model_decoder_layers_8_fc1_bias3, alloc771) R.vm.kill_object(alloc770) R.vm.kill_object(model_decoder_layers_8_fc1_weight3) R.vm.kill_object(model_decoder_layers_8_fc1_bias3) model_decoder_layers_8_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[702] gv1322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1322, R.dtype("float16")) _770: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_8_fc2_weight3, alloc771, model_decoder_layers_8_fc2_bias3, alloc772) R.vm.kill_object(alloc771) R.vm.kill_object(model_decoder_layers_8_fc2_weight3) R.vm.kill_object(model_decoder_layers_8_fc2_bias3) gv1323: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1323, R.dtype("float16")) cls.add(alloc769, alloc772, alloc773) R.vm.kill_object(alloc769) R.vm.kill_object(alloc772) model_decoder_layers_9_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[713] gv1324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc774: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1324, R.dtype("float16")) cls.layer_norm(alloc773, model_decoder_layers_9_self_attn_layer_norm_weight3, model_decoder_layers_9_self_attn_layer_norm_bias3, alloc774) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias3) model_decoder_layers_9_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[709] gv1325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1325, R.dtype("float16")) _773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_q_proj_weight3, alloc774, 
model_decoder_layers_9_self_attn_q_proj_bias3, alloc775) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias3) gv1326: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape800: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc775, gv1326, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc775) model_decoder_layers_9_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv1327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1327, R.dtype("float16")) _774: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_9_self_attn_k_proj_weight3, alloc774, alloc776) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight3) gv1328: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape801: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc776, gv1328, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc776) model_decoder_layers_9_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[707] gv1329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1329, R.dtype("float16")) _775: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_v_proj_weight3, alloc774, model_decoder_layers_9_self_attn_v_proj_bias3, alloc777) R.vm.kill_object(alloc774) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias3) gv1330: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape802: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc777, gv1330, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc777) gv1331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc778: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1331, R.dtype("float16")) cls.concatenate(reshape800, reshape801, reshape802, alloc778) R.vm.kill_object(reshape800) R.vm.kill_object(reshape801) R.vm.kill_object(reshape802) gv1332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape803: 
R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc778, gv1332, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc778) gv1333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1333, R.dtype("float16")) _777: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape803, alloc779) R.vm.kill_object(reshape803) gv1334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape804: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc779, gv1334, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc779) gv1335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape805: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape804, gv1335, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape804) model_decoder_layers_9_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[711] gv1336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1336, R.dtype("float16")) _778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_out_proj_weight3, reshape805, model_decoder_layers_9_self_attn_out_proj_bias3, alloc780) R.vm.kill_object(reshape805) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias3) gv1337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1337, R.dtype("float16")) cls.add(alloc773, alloc780, alloc781) R.vm.kill_object(alloc773) R.vm.kill_object(alloc780) model_decoder_layers_9_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[722] gv1338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc782: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1338, R.dtype("float16")) cls.layer_norm(alloc781, model_decoder_layers_9_encoder_attn_layer_norm_weight3, model_decoder_layers_9_encoder_attn_layer_norm_bias3, alloc782) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias3) model_decoder_layers_9_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[718] gv1339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1339, R.dtype("float16")) _781: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight3, alloc782, model_decoder_layers_9_encoder_attn_q_proj_bias3, alloc783) R.vm.kill_object(alloc782) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias3) gv1340: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape806: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc783, gv1340, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc783) gv1341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape807: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape806, gv1341, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape806) gv1342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1342, R.dtype("float16")) _782: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape807, alloc784) R.vm.kill_object(reshape807) gv1343: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape808: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc784, gv1343, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc784) gv1344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape809: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape808, gv1344, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape808) model_decoder_layers_9_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[720] gv1345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1345, R.dtype("float16")) _783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight3, reshape809, 
model_decoder_layers_9_encoder_attn_out_proj_bias3, alloc785) R.vm.kill_object(reshape809) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias3) gv1346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1346, R.dtype("float16")) cls.add(alloc781, alloc785, alloc786) R.vm.kill_object(alloc781) R.vm.kill_object(alloc785) model_decoder_layers_9_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[728] gv1347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1347, R.dtype("float16")) cls.layer_norm(alloc786, model_decoder_layers_9_final_layer_norm_weight3, model_decoder_layers_9_final_layer_norm_bias3, alloc787) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias3) model_decoder_layers_9_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[724] gv1348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, 
R.prim_value(0), gv1348, R.dtype("float16")) _786: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_9_fc1_weight3, alloc787, model_decoder_layers_9_fc1_bias3, alloc788) R.vm.kill_object(alloc787) R.vm.kill_object(model_decoder_layers_9_fc1_weight3) R.vm.kill_object(model_decoder_layers_9_fc1_bias3) model_decoder_layers_9_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[726] gv1349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1349, R.dtype("float16")) _787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_9_fc2_weight3, alloc788, model_decoder_layers_9_fc2_bias3, alloc789) R.vm.kill_object(alloc788) R.vm.kill_object(model_decoder_layers_9_fc2_weight3) R.vm.kill_object(model_decoder_layers_9_fc2_bias3) gv1350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1350, R.dtype("float16")) cls.add(alloc786, alloc789, alloc790) R.vm.kill_object(alloc786) R.vm.kill_object(alloc789) model_decoder_layers_10_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[737] gv1351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1351, R.dtype("float16")) cls.layer_norm(alloc790, model_decoder_layers_10_self_attn_layer_norm_weight3, model_decoder_layers_10_self_attn_layer_norm_bias3, alloc791) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias3) model_decoder_layers_10_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[733] gv1352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1352, R.dtype("float16")) _790: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_q_proj_weight3, alloc791, model_decoder_layers_10_self_attn_q_proj_bias3, alloc792) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias3) gv1353: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape810: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc792, gv1353, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc792) model_decoder_layers_10_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[729] gv1354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1354, R.dtype("float16")) _791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_10_self_attn_k_proj_weight3, alloc791, alloc793) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight3) gv1355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape811: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc793, gv1355, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc793) model_decoder_layers_10_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[731] gv1356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1356, R.dtype("float16")) _792: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_v_proj_weight3, alloc791, model_decoder_layers_10_self_attn_v_proj_bias3, alloc794) R.vm.kill_object(alloc791) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias3) 
gv1357: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape812: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc794, gv1357, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc794) gv1358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc795: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1358, R.dtype("float16")) cls.concatenate(reshape810, reshape811, reshape812, alloc795) R.vm.kill_object(reshape810) R.vm.kill_object(reshape811) R.vm.kill_object(reshape812) gv1359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape813: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc795, gv1359, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc795) gv1360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1360, R.dtype("float16")) _794: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape813, alloc796) R.vm.kill_object(reshape813) 
gv1361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape814: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc796, gv1361, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc796) gv1362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape815: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape814, gv1362, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape814) model_decoder_layers_10_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[735] gv1363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1363, R.dtype("float16")) _795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_out_proj_weight3, reshape815, model_decoder_layers_10_self_attn_out_proj_bias3, alloc797) R.vm.kill_object(reshape815) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias3) gv1364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1364, R.dtype("float16")) cls.add(alloc790, alloc797, alloc798) R.vm.kill_object(alloc790) R.vm.kill_object(alloc797) model_decoder_layers_10_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[746] gv1365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1365, R.dtype("float16")) cls.layer_norm(alloc798, model_decoder_layers_10_encoder_attn_layer_norm_weight3, model_decoder_layers_10_encoder_attn_layer_norm_bias3, alloc799) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias3) model_decoder_layers_10_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[742] gv1366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1366, R.dtype("float16")) _798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight3, alloc799, model_decoder_layers_10_encoder_attn_q_proj_bias3, alloc800) R.vm.kill_object(alloc799) 
R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias3) gv1367: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape816: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc800, gv1367, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc800) gv1368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape817: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape816, gv1368, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape816) gv1369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1369, R.dtype("float16")) _799: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape817, alloc801) R.vm.kill_object(reshape817) gv1370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape818: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc801, gv1370, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc801) gv1371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape819: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape818, gv1371, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape818) model_decoder_layers_10_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[744] gv1372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1372, R.dtype("float16")) _800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight3, reshape819, model_decoder_layers_10_encoder_attn_out_proj_bias3, alloc802) R.vm.kill_object(reshape819) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias3) gv1373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1373, R.dtype("float16")) cls.add(alloc798, alloc802, alloc803) R.vm.kill_object(alloc798) R.vm.kill_object(alloc802) model_decoder_layers_10_final_layer_norm_weight3: 
R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[752] gv1374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1374, R.dtype("float16")) cls.layer_norm(alloc803, model_decoder_layers_10_final_layer_norm_weight3, model_decoder_layers_10_final_layer_norm_bias3, alloc804) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias3) model_decoder_layers_10_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[748] gv1375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1375, R.dtype("float16")) _803: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_10_fc1_weight3, alloc804, model_decoder_layers_10_fc1_bias3, alloc805) R.vm.kill_object(alloc804) R.vm.kill_object(model_decoder_layers_10_fc1_weight3) R.vm.kill_object(model_decoder_layers_10_fc1_bias3) model_decoder_layers_10_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[750] gv1376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc806: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1376, R.dtype("float16")) _804: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_10_fc2_weight3, alloc805, model_decoder_layers_10_fc2_bias3, alloc806) R.vm.kill_object(alloc805) R.vm.kill_object(model_decoder_layers_10_fc2_weight3) R.vm.kill_object(model_decoder_layers_10_fc2_bias3) gv1377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1377, R.dtype("float16")) cls.add(alloc803, alloc806, alloc807) R.vm.kill_object(alloc803) R.vm.kill_object(alloc806) model_decoder_layers_11_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[761] gv1378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1378, R.dtype("float16")) cls.layer_norm(alloc807, model_decoder_layers_11_self_attn_layer_norm_weight3, model_decoder_layers_11_self_attn_layer_norm_bias3, alloc808) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias3) model_decoder_layers_11_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[757] gv1379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1379, R.dtype("float16")) _807: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_q_proj_weight3, alloc808, model_decoder_layers_11_self_attn_q_proj_bias3, alloc809) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias3) gv1380: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape820: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc809, gv1380, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc809) model_decoder_layers_11_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv1381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1381, R.dtype("float16")) _808: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_11_self_attn_k_proj_weight3, alloc808, alloc810) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight3) gv1382: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape821: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc810, gv1382, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc810) model_decoder_layers_11_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[755] gv1383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1383, R.dtype("float16")) _809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_v_proj_weight3, alloc808, model_decoder_layers_11_self_attn_v_proj_bias3, alloc811) R.vm.kill_object(alloc808) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias3) gv1384: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape822: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc811, gv1384, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc811) gv1385: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc812: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1385, R.dtype("float16")) cls.concatenate(reshape820, reshape821, reshape822, alloc812) R.vm.kill_object(reshape820) R.vm.kill_object(reshape821) R.vm.kill_object(reshape822) gv1386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape823: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc812, gv1386, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc812) gv1387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1387, R.dtype("float16")) _811: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape823, alloc813) R.vm.kill_object(reshape823) gv1388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape824: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc813, gv1388, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc813) gv1389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape825: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape824, gv1389, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape824) model_decoder_layers_11_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[759] gv1390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1390, R.dtype("float16")) _812: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_out_proj_weight3, reshape825, model_decoder_layers_11_self_attn_out_proj_bias3, alloc814) R.vm.kill_object(reshape825) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias3) gv1391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1391, R.dtype("float16")) cls.add(alloc807, alloc814, alloc815) R.vm.kill_object(alloc807) R.vm.kill_object(alloc814) model_decoder_layers_11_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[770] gv1392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc816: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1392, R.dtype("float16")) cls.layer_norm(alloc815, model_decoder_layers_11_encoder_attn_layer_norm_weight3, model_decoder_layers_11_encoder_attn_layer_norm_bias3, alloc816) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias3) model_decoder_layers_11_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[766] gv1393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1393, R.dtype("float16")) _815: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight3, alloc816, model_decoder_layers_11_encoder_attn_q_proj_bias3, alloc817) R.vm.kill_object(alloc816) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias3) gv1394: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape826: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc817, gv1394, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc817) gv1395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape827: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape826, gv1395, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape826) gv1396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1396, R.dtype("float16")) _816: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape827, alloc818) R.vm.kill_object(reshape827) gv1397: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape828: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc818, gv1397, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc818) gv1398: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape829: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape828, gv1398, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape828) model_decoder_layers_11_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[768] gv1399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1399, R.dtype("float16")) _817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight3, reshape829, model_decoder_layers_11_encoder_attn_out_proj_bias3, alloc819) R.vm.kill_object(reshape829) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias3) gv1400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1400, R.dtype("float16")) cls.add(alloc815, alloc819, alloc820) R.vm.kill_object(alloc815) R.vm.kill_object(alloc819) model_decoder_layers_11_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[776] gv1401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1401, R.dtype("float16")) cls.layer_norm(alloc820, model_decoder_layers_11_final_layer_norm_weight3, model_decoder_layers_11_final_layer_norm_bias3, alloc821) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias3) model_decoder_layers_11_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[772] gv1402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1402, R.dtype("float16")) _820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_11_fc1_weight3, alloc821, model_decoder_layers_11_fc1_bias3, alloc822) R.vm.kill_object(alloc821) R.vm.kill_object(model_decoder_layers_11_fc1_weight3) R.vm.kill_object(model_decoder_layers_11_fc1_bias3) model_decoder_layers_11_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[774] gv1403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1403, R.dtype("float16")) _821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_11_fc2_weight3, alloc822, model_decoder_layers_11_fc2_bias3, alloc823) R.vm.kill_object(alloc822) R.vm.kill_object(model_decoder_layers_11_fc2_weight3) R.vm.kill_object(model_decoder_layers_11_fc2_bias3) gv1404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1404, R.dtype("float16")) cls.add(alloc820, alloc823, alloc824) R.vm.kill_object(alloc820) R.vm.kill_object(alloc823) model_decoder_layers_12_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[785] gv1405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1405, R.dtype("float16")) cls.layer_norm(alloc824, model_decoder_layers_12_self_attn_layer_norm_weight3, model_decoder_layers_12_self_attn_layer_norm_bias3, alloc825) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias3) model_decoder_layers_12_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[781] gv1406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1406, R.dtype("float16")) _824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_q_proj_weight3, alloc825, model_decoder_layers_12_self_attn_q_proj_bias3, alloc826) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias3) 
gv1407: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape830: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc826, gv1407, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc826) model_decoder_layers_12_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv1408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1408, R.dtype("float16")) _825: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_12_self_attn_k_proj_weight3, alloc825, alloc827) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight3) gv1409: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape831: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc827, gv1409, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc827) model_decoder_layers_12_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[779] gv1410: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1410, R.dtype("float16")) _826: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_v_proj_weight3, alloc825, model_decoder_layers_12_self_attn_v_proj_bias3, alloc828) R.vm.kill_object(alloc825) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias3) gv1411: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape832: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc828, gv1411, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc828) gv1412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc829: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1412, R.dtype("float16")) cls.concatenate(reshape830, reshape831, reshape832, alloc829) R.vm.kill_object(reshape830) R.vm.kill_object(reshape831) R.vm.kill_object(reshape832) gv1413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape833: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc829, gv1413, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc829) gv1414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1414, R.dtype("float16")) _828: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape833, alloc830) R.vm.kill_object(reshape833) gv1415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape834: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc830, gv1415, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc830) gv1416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape835: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape834, gv1416, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape834) model_decoder_layers_12_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[783] gv1417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc831: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage14, R.prim_value(0), gv1417, R.dtype("float16")) _829: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_out_proj_weight3, reshape835, model_decoder_layers_12_self_attn_out_proj_bias3, alloc831) R.vm.kill_object(reshape835) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias3) gv1418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1418, R.dtype("float16")) cls.add(alloc824, alloc831, alloc832) R.vm.kill_object(alloc824) R.vm.kill_object(alloc831) model_decoder_layers_12_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[794] gv1419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1419, R.dtype("float16")) cls.layer_norm(alloc832, model_decoder_layers_12_encoder_attn_layer_norm_weight3, model_decoder_layers_12_encoder_attn_layer_norm_bias3, alloc833) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias3) model_decoder_layers_12_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[790] gv1420: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1420, R.dtype("float16")) _832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight3, alloc833, model_decoder_layers_12_encoder_attn_q_proj_bias3, alloc834) R.vm.kill_object(alloc833) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias3) gv1421: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape836: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc834, gv1421, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc834) gv1422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape837: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape836, gv1422, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape836) gv1423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1423, R.dtype("float16")) _833: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape837, alloc835) R.vm.kill_object(reshape837) gv1424: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape838: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc835, gv1424, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc835) gv1425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape839: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape838, gv1425, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape838) model_decoder_layers_12_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[792] gv1426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1426, R.dtype("float16")) _834: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight3, reshape839, model_decoder_layers_12_encoder_attn_out_proj_bias3, alloc836) R.vm.kill_object(reshape839) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias3) gv1427: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1427, R.dtype("float16")) cls.add(alloc832, alloc836, alloc837) R.vm.kill_object(alloc832) R.vm.kill_object(alloc836) model_decoder_layers_12_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[800] gv1428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1428, R.dtype("float16")) cls.layer_norm(alloc837, model_decoder_layers_12_final_layer_norm_weight3, model_decoder_layers_12_final_layer_norm_bias3, alloc838) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias3) model_decoder_layers_12_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[796] gv1429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc839: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1429, R.dtype("float16")) _837: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_12_fc1_weight3, alloc838, model_decoder_layers_12_fc1_bias3, alloc839) R.vm.kill_object(alloc838) R.vm.kill_object(model_decoder_layers_12_fc1_weight3) R.vm.kill_object(model_decoder_layers_12_fc1_bias3) model_decoder_layers_12_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[798] gv1430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1430, R.dtype("float16")) _838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_12_fc2_weight3, alloc839, model_decoder_layers_12_fc2_bias3, alloc840) R.vm.kill_object(alloc839) R.vm.kill_object(model_decoder_layers_12_fc2_weight3) R.vm.kill_object(model_decoder_layers_12_fc2_bias3) gv1431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1431, R.dtype("float16")) cls.add(alloc837, alloc840, alloc841) R.vm.kill_object(alloc837) R.vm.kill_object(alloc840) model_decoder_layers_13_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[809] gv1432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc842: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1432, R.dtype("float16")) cls.layer_norm(alloc841, model_decoder_layers_13_self_attn_layer_norm_weight3, model_decoder_layers_13_self_attn_layer_norm_bias3, alloc842) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias3) model_decoder_layers_13_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[805] gv1433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1433, R.dtype("float16")) _841: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_q_proj_weight3, alloc842, model_decoder_layers_13_self_attn_q_proj_bias3, alloc843) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias3) gv1434: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape840: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc843, gv1434, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc843) model_decoder_layers_13_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv1435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1435, R.dtype("float16")) _842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_13_self_attn_k_proj_weight3, alloc842, alloc844) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight3) gv1436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape841: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc844, gv1436, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc844) model_decoder_layers_13_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[803] gv1437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1437, R.dtype("float16")) _843: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_v_proj_weight3, alloc842, model_decoder_layers_13_self_attn_v_proj_bias3, alloc845) R.vm.kill_object(alloc842) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias3) gv1438: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape842: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc845, gv1438, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc845) gv1439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc846: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1439, R.dtype("float16")) cls.concatenate(reshape840, reshape841, reshape842, alloc846) R.vm.kill_object(reshape840) R.vm.kill_object(reshape841) R.vm.kill_object(reshape842) gv1440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape843: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc846, gv1440, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc846) gv1441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc847: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1441, R.dtype("float16")) _845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape843, alloc847) R.vm.kill_object(reshape843) gv1442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape844: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc847, gv1442, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc847) gv1443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape845: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape844, gv1443, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape844) model_decoder_layers_13_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[807] gv1444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1444, R.dtype("float16")) _846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_out_proj_weight3, reshape845, model_decoder_layers_13_self_attn_out_proj_bias3, alloc848) R.vm.kill_object(reshape845) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias3) gv1445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc849: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1445, R.dtype("float16")) cls.add(alloc841, alloc848, alloc849) R.vm.kill_object(alloc841) R.vm.kill_object(alloc848) model_decoder_layers_13_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[818] gv1446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1446, R.dtype("float16")) cls.layer_norm(alloc849, model_decoder_layers_13_encoder_attn_layer_norm_weight3, model_decoder_layers_13_encoder_attn_layer_norm_bias3, alloc850) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias3) model_decoder_layers_13_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[814] gv1447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1447, R.dtype("float16")) _849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight3, alloc850, model_decoder_layers_13_encoder_attn_q_proj_bias3, alloc851) R.vm.kill_object(alloc850) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias3) gv1448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape846: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc851, gv1448, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc851) gv1449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape847: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape846, gv1449, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape846) gv1450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1450, R.dtype("float16")) _850: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape847, alloc852) R.vm.kill_object(reshape847) gv1451: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape848: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc852, gv1451, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc852) 
gv1452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape849: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape848, gv1452, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape848) model_decoder_layers_13_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[816] gv1453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1453, R.dtype("float16")) _851: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight3, reshape849, model_decoder_layers_13_encoder_attn_out_proj_bias3, alloc853) R.vm.kill_object(reshape849) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias3) gv1454: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc854: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1454, R.dtype("float16")) cls.add(alloc849, alloc853, alloc854) R.vm.kill_object(alloc849) R.vm.kill_object(alloc853) model_decoder_layers_13_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[823] 
model_decoder_layers_13_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[824] gv1455: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1455, R.dtype("float16")) cls.layer_norm(alloc854, model_decoder_layers_13_final_layer_norm_weight3, model_decoder_layers_13_final_layer_norm_bias3, alloc855) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias3) model_decoder_layers_13_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[820] gv1456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1456, R.dtype("float16")) _854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_13_fc1_weight3, alloc855, model_decoder_layers_13_fc1_bias3, alloc856) R.vm.kill_object(alloc855) R.vm.kill_object(model_decoder_layers_13_fc1_weight3) R.vm.kill_object(model_decoder_layers_13_fc1_bias3) model_decoder_layers_13_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[822] gv1457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1457, R.dtype("float16")) _855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_13_fc2_weight3, alloc856, model_decoder_layers_13_fc2_bias3, alloc857) R.vm.kill_object(alloc856) R.vm.kill_object(model_decoder_layers_13_fc2_weight3) R.vm.kill_object(model_decoder_layers_13_fc2_bias3) gv1458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc858: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1458, R.dtype("float16")) cls.add(alloc854, alloc857, alloc858) R.vm.kill_object(alloc854) R.vm.kill_object(alloc857) model_decoder_layers_14_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[833] gv1459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc859: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1459, R.dtype("float16")) cls.layer_norm(alloc858, model_decoder_layers_14_self_attn_layer_norm_weight3, model_decoder_layers_14_self_attn_layer_norm_bias3, alloc859) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias3) model_decoder_layers_14_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[829] gv1460: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1460, R.dtype("float16")) _858: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_q_proj_weight3, alloc859, model_decoder_layers_14_self_attn_q_proj_bias3, alloc860) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias3) gv1461: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape850: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc860, gv1461, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc860) model_decoder_layers_14_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv1462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1462, R.dtype("float16")) _859: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_14_self_attn_k_proj_weight3, alloc859, alloc861) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight3) gv1463: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape851: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc861, gv1463, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc861) model_decoder_layers_14_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[827] gv1464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc862: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1464, R.dtype("float16")) _860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_v_proj_weight3, alloc859, model_decoder_layers_14_self_attn_v_proj_bias3, alloc862) R.vm.kill_object(alloc859) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias3) gv1465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape852: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc862, gv1465, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc862) gv1466: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc863: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1466, R.dtype("float16")) cls.concatenate(reshape850, reshape851, reshape852, alloc863) R.vm.kill_object(reshape850) R.vm.kill_object(reshape851) R.vm.kill_object(reshape852) gv1467: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape853: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc863, gv1467, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc863) gv1468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1468, R.dtype("float16")) _862: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape853, alloc864) R.vm.kill_object(reshape853) gv1469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape854: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc864, gv1469, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc864) gv1470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape855: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape854, gv1470, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape854) model_decoder_layers_14_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[831] gv1471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1471, R.dtype("float16")) _863: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_out_proj_weight3, reshape855, model_decoder_layers_14_self_attn_out_proj_bias3, alloc865) R.vm.kill_object(reshape855) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias3) gv1472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1472, R.dtype("float16")) cls.add(alloc858, alloc865, alloc866) R.vm.kill_object(alloc858) R.vm.kill_object(alloc865) model_decoder_layers_14_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[842] gv1473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1473, R.dtype("float16")) cls.layer_norm(alloc866, model_decoder_layers_14_encoder_attn_layer_norm_weight3, model_decoder_layers_14_encoder_attn_layer_norm_bias3, alloc867) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias3) model_decoder_layers_14_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[838] gv1474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1474, R.dtype("float16")) _866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight3, alloc867, model_decoder_layers_14_encoder_attn_q_proj_bias3, alloc868) R.vm.kill_object(alloc867) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias3) gv1475: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape856: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc868, gv1475, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc868) gv1476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape857: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape856, gv1476, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape856) gv1477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1477, R.dtype("float16")) _867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape857, alloc869) R.vm.kill_object(reshape857) gv1478: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape858: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc869, gv1478, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc869) gv1479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape859: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape858, gv1479, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape858) model_decoder_layers_14_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[840] 
gv1480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1480, R.dtype("float16")) _868: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight3, reshape859, model_decoder_layers_14_encoder_attn_out_proj_bias3, alloc870) R.vm.kill_object(reshape859) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias3) gv1481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1481, R.dtype("float16")) cls.add(alloc866, alloc870, alloc871) R.vm.kill_object(alloc866) R.vm.kill_object(alloc870) model_decoder_layers_14_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[848] gv1482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1482, R.dtype("float16")) cls.layer_norm(alloc871, model_decoder_layers_14_final_layer_norm_weight3, model_decoder_layers_14_final_layer_norm_bias3, alloc872) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias3) model_decoder_layers_14_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[844] gv1483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1483, R.dtype("float16")) _871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_14_fc1_weight3, alloc872, model_decoder_layers_14_fc1_bias3, alloc873) R.vm.kill_object(alloc872) R.vm.kill_object(model_decoder_layers_14_fc1_weight3) R.vm.kill_object(model_decoder_layers_14_fc1_bias3) model_decoder_layers_14_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[846] gv1484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1484, R.dtype("float16")) _872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_14_fc2_weight3, alloc873, model_decoder_layers_14_fc2_bias3, alloc874) R.vm.kill_object(alloc873) R.vm.kill_object(model_decoder_layers_14_fc2_weight3) R.vm.kill_object(model_decoder_layers_14_fc2_bias3) gv1485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1485, R.dtype("float16")) cls.add(alloc871, alloc874, alloc875) R.vm.kill_object(alloc871) R.vm.kill_object(alloc874) model_decoder_layers_15_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[857] gv1486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1486, R.dtype("float16")) cls.layer_norm(alloc875, model_decoder_layers_15_self_attn_layer_norm_weight3, model_decoder_layers_15_self_attn_layer_norm_bias3, alloc876) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias3) model_decoder_layers_15_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[853] gv1487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1487, R.dtype("float16")) _875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_q_proj_weight3, alloc876, model_decoder_layers_15_self_attn_q_proj_bias3, alloc877) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias3) 
gv1488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape860: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc877, gv1488, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc877) model_decoder_layers_15_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv1489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1489, R.dtype("float16")) _876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_15_self_attn_k_proj_weight3, alloc876, alloc878) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight3) gv1490: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape861: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc878, gv1490, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc878) model_decoder_layers_15_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[851] gv1491: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1491, R.dtype("float16")) _877: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_v_proj_weight3, alloc876, model_decoder_layers_15_self_attn_v_proj_bias3, alloc879) R.vm.kill_object(alloc876) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias3) gv1492: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape862: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc879, gv1492, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc879) gv1493: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc880: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1493, R.dtype("float16")) cls.concatenate(reshape860, reshape861, reshape862, alloc880) R.vm.kill_object(reshape860) R.vm.kill_object(reshape861) R.vm.kill_object(reshape862) gv1494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape863: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc880, gv1494, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc880) gv1495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1495, R.dtype("float16")) _879: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape863, alloc881) R.vm.kill_object(reshape863) gv1496: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape864: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc881, gv1496, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc881) gv1497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape865: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape864, gv1497, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape864) model_decoder_layers_15_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[855] gv1498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc882: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1498, R.dtype("float16")) _880: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_out_proj_weight3, reshape865, model_decoder_layers_15_self_attn_out_proj_bias3, alloc882) R.vm.kill_object(reshape865) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias3) gv1499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1499, R.dtype("float16")) cls.add(alloc875, alloc882, alloc883) R.vm.kill_object(alloc875) R.vm.kill_object(alloc882) model_decoder_layers_15_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[866] gv1500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1500, R.dtype("float16")) cls.layer_norm(alloc883, model_decoder_layers_15_encoder_attn_layer_norm_weight3, model_decoder_layers_15_encoder_attn_layer_norm_bias3, alloc884) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias3) model_decoder_layers_15_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[862] gv1501: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1501, R.dtype("float16")) _883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight3, alloc884, model_decoder_layers_15_encoder_attn_q_proj_bias3, alloc885) R.vm.kill_object(alloc884) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias3) gv1502: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape866: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc885, gv1502, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc885) gv1503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape867: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape866, gv1503, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape866) gv1504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1504, R.dtype("float16")) _884: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape867, alloc886) R.vm.kill_object(reshape867) gv1505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape868: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc886, gv1505, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc886) gv1506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape869: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape868, gv1506, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape868) model_decoder_layers_15_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[864] gv1507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1507, R.dtype("float16")) _885: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight3, reshape869, model_decoder_layers_15_encoder_attn_out_proj_bias3, alloc887) R.vm.kill_object(reshape869) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias3) gv1508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1508, R.dtype("float16")) cls.add(alloc883, alloc887, alloc888) R.vm.kill_object(alloc883) R.vm.kill_object(alloc887) model_decoder_layers_15_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[872] gv1509: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1509, R.dtype("float16")) cls.layer_norm(alloc888, model_decoder_layers_15_final_layer_norm_weight3, model_decoder_layers_15_final_layer_norm_bias3, alloc889) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias3) model_decoder_layers_15_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[868] gv1510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1510, R.dtype("float16")) _888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_15_fc1_weight3, alloc889, model_decoder_layers_15_fc1_bias3, alloc890) R.vm.kill_object(alloc889) R.vm.kill_object(model_decoder_layers_15_fc1_weight3) R.vm.kill_object(model_decoder_layers_15_fc1_bias3) model_decoder_layers_15_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[870] gv1511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1511, R.dtype("float16")) _889: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_15_fc2_weight3, alloc890, model_decoder_layers_15_fc2_bias3, alloc891) R.vm.kill_object(alloc890) R.vm.kill_object(model_decoder_layers_15_fc2_weight3) R.vm.kill_object(model_decoder_layers_15_fc2_bias3) gv1512: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1512, R.dtype("float16")) cls.add(alloc888, alloc891, alloc892) R.vm.kill_object(alloc888) R.vm.kill_object(alloc891) model_decoder_layers_16_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[881] gv1513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc893: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1513, R.dtype("float16")) cls.layer_norm(alloc892, model_decoder_layers_16_self_attn_layer_norm_weight3, model_decoder_layers_16_self_attn_layer_norm_bias3, alloc893) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias3) model_decoder_layers_16_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[877] gv1514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc894: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1514, R.dtype("float16")) _892: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_q_proj_weight3, alloc893, model_decoder_layers_16_self_attn_q_proj_bias3, alloc894) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias3) gv1515: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape870: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc894, gv1515, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc894) model_decoder_layers_16_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv1516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1516, R.dtype("float16")) _893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_16_self_attn_k_proj_weight3, alloc893, alloc895) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight3) gv1517: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape871: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc895, gv1517, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc895) model_decoder_layers_16_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[875] gv1518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1518, R.dtype("float16")) _894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_v_proj_weight3, alloc893, model_decoder_layers_16_self_attn_v_proj_bias3, alloc896) R.vm.kill_object(alloc893) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias3) gv1519: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape872: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc896, gv1519, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc896) gv1520: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc897: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1520, R.dtype("float16")) cls.concatenate(reshape870, reshape871, reshape872, alloc897) R.vm.kill_object(reshape870) R.vm.kill_object(reshape871) R.vm.kill_object(reshape872) gv1521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape873: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc897, gv1521, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc897) gv1522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1522, R.dtype("float16")) _896: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape873, alloc898) R.vm.kill_object(reshape873) gv1523: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape874: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc898, gv1523, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc898) gv1524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape875: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape874, gv1524, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape874) model_decoder_layers_16_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[879] gv1525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1525, R.dtype("float16")) _897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_out_proj_weight3, reshape875, model_decoder_layers_16_self_attn_out_proj_bias3, alloc899) R.vm.kill_object(reshape875) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias3) gv1526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc900: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1526, R.dtype("float16")) cls.add(alloc892, alloc899, alloc900) R.vm.kill_object(alloc892) R.vm.kill_object(alloc899) model_decoder_layers_16_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[890] gv1527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc901: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1527, R.dtype("float16")) cls.layer_norm(alloc900, model_decoder_layers_16_encoder_attn_layer_norm_weight3, model_decoder_layers_16_encoder_attn_layer_norm_bias3, alloc901) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias3) model_decoder_layers_16_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[886] gv1528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1528, R.dtype("float16")) _900: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight3, alloc901, model_decoder_layers_16_encoder_attn_q_proj_bias3, alloc902) R.vm.kill_object(alloc901) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias3) gv1529: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape876: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc902, gv1529, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc902) gv1530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape877: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape876, gv1530, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape876) gv1531: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1531, R.dtype("float16")) _901: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape877, alloc903) R.vm.kill_object(reshape877) gv1532: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape878: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc903, gv1532, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc903) 
gv1533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape879: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape878, gv1533, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape878) model_decoder_layers_16_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[888] gv1534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc904: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1534, R.dtype("float16")) _902: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight3, reshape879, model_decoder_layers_16_encoder_attn_out_proj_bias3, alloc904) R.vm.kill_object(reshape879) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias3) gv1535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1535, R.dtype("float16")) cls.add(alloc900, alloc904, alloc905) R.vm.kill_object(alloc900) R.vm.kill_object(alloc904) model_decoder_layers_16_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[895] 
model_decoder_layers_16_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[896] gv1536: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1536, R.dtype("float16")) cls.layer_norm(alloc905, model_decoder_layers_16_final_layer_norm_weight3, model_decoder_layers_16_final_layer_norm_bias3, alloc906) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias3) model_decoder_layers_16_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[892] gv1537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1537, R.dtype("float16")) _905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_16_fc1_weight3, alloc906, model_decoder_layers_16_fc1_bias3, alloc907) R.vm.kill_object(alloc906) R.vm.kill_object(model_decoder_layers_16_fc1_weight3) R.vm.kill_object(model_decoder_layers_16_fc1_bias3) model_decoder_layers_16_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[894] gv1538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1538, R.dtype("float16")) _906: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_16_fc2_weight3, alloc907, model_decoder_layers_16_fc2_bias3, alloc908) R.vm.kill_object(alloc907) R.vm.kill_object(model_decoder_layers_16_fc2_weight3) R.vm.kill_object(model_decoder_layers_16_fc2_bias3) gv1539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1539, R.dtype("float16")) cls.add(alloc905, alloc908, alloc909) R.vm.kill_object(alloc905) R.vm.kill_object(alloc908) model_decoder_layers_17_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[905] gv1540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1540, R.dtype("float16")) cls.layer_norm(alloc909, model_decoder_layers_17_self_attn_layer_norm_weight3, model_decoder_layers_17_self_attn_layer_norm_bias3, alloc910) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias3) model_decoder_layers_17_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[901] gv1541: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1541, R.dtype("float16")) _909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_q_proj_weight3, alloc910, model_decoder_layers_17_self_attn_q_proj_bias3, alloc911) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias3) gv1542: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape880: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc911, gv1542, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc911) model_decoder_layers_17_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv1543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1543, R.dtype("float16")) _910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_17_self_attn_k_proj_weight3, alloc910, alloc912) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight3) gv1544: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape881: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc912, gv1544, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc912) model_decoder_layers_17_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[899] gv1545: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1545, R.dtype("float16")) _911: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_v_proj_weight3, alloc910, model_decoder_layers_17_self_attn_v_proj_bias3, alloc913) R.vm.kill_object(alloc910) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias3) gv1546: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape882: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc913, gv1546, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc913) gv1547: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc914: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1547, R.dtype("float16")) cls.concatenate(reshape880, reshape881, reshape882, alloc914) R.vm.kill_object(reshape880) R.vm.kill_object(reshape881) R.vm.kill_object(reshape882) gv1548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape883: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc914, gv1548, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc914) gv1549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1549, R.dtype("float16")) _913: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape883, alloc915) R.vm.kill_object(reshape883) gv1550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape884: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc915, gv1550, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc915) gv1551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape885: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape884, gv1551, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape884) model_decoder_layers_17_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[903] gv1552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1552, R.dtype("float16")) _914: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_out_proj_weight3, reshape885, model_decoder_layers_17_self_attn_out_proj_bias3, alloc916) R.vm.kill_object(reshape885) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias3) gv1553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1553, R.dtype("float16")) cls.add(alloc909, alloc916, alloc917) R.vm.kill_object(alloc909) R.vm.kill_object(alloc916) model_decoder_layers_17_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[914] gv1554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1554, R.dtype("float16")) cls.layer_norm(alloc917, model_decoder_layers_17_encoder_attn_layer_norm_weight3, model_decoder_layers_17_encoder_attn_layer_norm_bias3, alloc918) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias3) model_decoder_layers_17_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[910] gv1555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1555, R.dtype("float16")) _917: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight3, alloc918, model_decoder_layers_17_encoder_attn_q_proj_bias3, alloc919) R.vm.kill_object(alloc918) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias3) gv1556: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape886: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc919, gv1556, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc919) gv1557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape887: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape886, gv1557, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape886) gv1558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1558, R.dtype("float16")) _918: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape887, alloc920) R.vm.kill_object(reshape887) gv1559: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape888: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc920, gv1559, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc920) gv1560: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape889: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape888, gv1560, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape888) model_decoder_layers_17_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[912] 
gv1561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1561, R.dtype("float16")) _919: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight3, reshape889, model_decoder_layers_17_encoder_attn_out_proj_bias3, alloc921) R.vm.kill_object(reshape889) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias3) gv1562: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1562, R.dtype("float16")) cls.add(alloc917, alloc921, alloc922) R.vm.kill_object(alloc917) R.vm.kill_object(alloc921) model_decoder_layers_17_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[920] gv1563: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1563, R.dtype("float16")) cls.layer_norm(alloc922, model_decoder_layers_17_final_layer_norm_weight3, model_decoder_layers_17_final_layer_norm_bias3, alloc923) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias3) model_decoder_layers_17_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[916] gv1564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1564, R.dtype("float16")) _922: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_17_fc1_weight3, alloc923, model_decoder_layers_17_fc1_bias3, alloc924) R.vm.kill_object(alloc923) R.vm.kill_object(model_decoder_layers_17_fc1_weight3) R.vm.kill_object(model_decoder_layers_17_fc1_bias3) model_decoder_layers_17_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[918] gv1565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1565, R.dtype("float16")) _923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_17_fc2_weight3, alloc924, model_decoder_layers_17_fc2_bias3, alloc925) R.vm.kill_object(alloc924) R.vm.kill_object(model_decoder_layers_17_fc2_weight3) R.vm.kill_object(model_decoder_layers_17_fc2_bias3) gv1566: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1566, R.dtype("float16")) cls.add(alloc922, alloc925, alloc926) R.vm.kill_object(alloc922) R.vm.kill_object(alloc925) model_decoder_layers_18_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[929] gv1567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1567, R.dtype("float16")) cls.layer_norm(alloc926, model_decoder_layers_18_self_attn_layer_norm_weight3, model_decoder_layers_18_self_attn_layer_norm_bias3, alloc927) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias3) model_decoder_layers_18_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[925] gv1568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1568, R.dtype("float16")) _926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_q_proj_weight3, alloc927, model_decoder_layers_18_self_attn_q_proj_bias3, alloc928) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias3) 
gv1569: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape890: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc928, gv1569, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc928) model_decoder_layers_18_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv1570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1570, R.dtype("float16")) _927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_18_self_attn_k_proj_weight3, alloc927, alloc929) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight3) gv1571: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape891: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc929, gv1571, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc929) model_decoder_layers_18_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[923] gv1572: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1572, R.dtype("float16")) _928: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_v_proj_weight3, alloc927, model_decoder_layers_18_self_attn_v_proj_bias3, alloc930) R.vm.kill_object(alloc927) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias3) gv1573: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape892: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc930, gv1573, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc930) gv1574: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc931: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1574, R.dtype("float16")) cls.concatenate(reshape890, reshape891, reshape892, alloc931) R.vm.kill_object(reshape890) R.vm.kill_object(reshape891) R.vm.kill_object(reshape892) gv1575: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape893: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc931, gv1575, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc931) gv1576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc932: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1576, R.dtype("float16")) _930: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape893, alloc932) R.vm.kill_object(reshape893) gv1577: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape894: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc932, gv1577, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc932) gv1578: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape895: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape894, gv1578, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape894) model_decoder_layers_18_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[927] gv1579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc933: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage14, R.prim_value(0), gv1579, R.dtype("float16")) _931: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_out_proj_weight3, reshape895, model_decoder_layers_18_self_attn_out_proj_bias3, alloc933) R.vm.kill_object(reshape895) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias3) gv1580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1580, R.dtype("float16")) cls.add(alloc926, alloc933, alloc934) R.vm.kill_object(alloc926) R.vm.kill_object(alloc933) model_decoder_layers_18_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[938] gv1581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1581, R.dtype("float16")) cls.layer_norm(alloc934, model_decoder_layers_18_encoder_attn_layer_norm_weight3, model_decoder_layers_18_encoder_attn_layer_norm_bias3, alloc935) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias3) model_decoder_layers_18_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[934] gv1582: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1582, R.dtype("float16")) _934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight3, alloc935, model_decoder_layers_18_encoder_attn_q_proj_bias3, alloc936) R.vm.kill_object(alloc935) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias3) gv1583: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape896: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc936, gv1583, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc936) gv1584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape897: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape896, gv1584, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape896) gv1585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1585, R.dtype("float16")) _935: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape897, alloc937) R.vm.kill_object(reshape897) gv1586: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape898: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc937, gv1586, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc937) gv1587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape899: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape898, gv1587, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape898) model_decoder_layers_18_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[936] gv1588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc938: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1588, R.dtype("float16")) _936: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight3, reshape899, model_decoder_layers_18_encoder_attn_out_proj_bias3, alloc938) R.vm.kill_object(reshape899) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias3) gv1589: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1589, R.dtype("float16")) cls.add(alloc934, alloc938, alloc939) R.vm.kill_object(alloc934) R.vm.kill_object(alloc938) model_decoder_layers_18_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[944] gv1590: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1590, R.dtype("float16")) cls.layer_norm(alloc939, model_decoder_layers_18_final_layer_norm_weight3, model_decoder_layers_18_final_layer_norm_bias3, alloc940) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias3) model_decoder_layers_18_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[940] gv1591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1591, R.dtype("float16")) _939: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_18_fc1_weight3, alloc940, model_decoder_layers_18_fc1_bias3, alloc941) R.vm.kill_object(alloc940) R.vm.kill_object(model_decoder_layers_18_fc1_weight3) R.vm.kill_object(model_decoder_layers_18_fc1_bias3) model_decoder_layers_18_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[942] gv1592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc942: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1592, R.dtype("float16")) _940: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_18_fc2_weight3, alloc941, model_decoder_layers_18_fc2_bias3, alloc942) R.vm.kill_object(alloc941) R.vm.kill_object(model_decoder_layers_18_fc2_weight3) R.vm.kill_object(model_decoder_layers_18_fc2_bias3) gv1593: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1593, R.dtype("float16")) cls.add(alloc939, alloc942, alloc943) R.vm.kill_object(alloc939) R.vm.kill_object(alloc942) model_decoder_layers_19_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[953] gv1594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc944: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1594, R.dtype("float16")) cls.layer_norm(alloc943, model_decoder_layers_19_self_attn_layer_norm_weight3, model_decoder_layers_19_self_attn_layer_norm_bias3, alloc944) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias3) model_decoder_layers_19_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[949] gv1595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1595, R.dtype("float16")) _943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_q_proj_weight3, alloc944, model_decoder_layers_19_self_attn_q_proj_bias3, alloc945) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias3) gv1596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape900: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc945, gv1596, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc945) model_decoder_layers_19_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv1597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc946: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1597, R.dtype("float16")) _944: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_19_self_attn_k_proj_weight3, alloc944, alloc946) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight3) gv1598: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape901: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc946, gv1598, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc946) model_decoder_layers_19_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[947] gv1599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1599, R.dtype("float16")) _945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_v_proj_weight3, alloc944, model_decoder_layers_19_self_attn_v_proj_bias3, alloc947) R.vm.kill_object(alloc944) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias3) gv1600: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape902: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc947, gv1600, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc947) gv1601: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc948: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1601, R.dtype("float16")) cls.concatenate(reshape900, reshape901, reshape902, alloc948) R.vm.kill_object(reshape900) R.vm.kill_object(reshape901) R.vm.kill_object(reshape902) gv1602: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape903: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc948, gv1602, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc948) gv1603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1603, R.dtype("float16")) _947: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape903, alloc949) R.vm.kill_object(reshape903) gv1604: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape904: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc949, gv1604, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc949) gv1605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape905: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape904, gv1605, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape904) model_decoder_layers_19_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[951] gv1606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc950: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1606, R.dtype("float16")) _948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_out_proj_weight3, reshape905, model_decoder_layers_19_self_attn_out_proj_bias3, alloc950) R.vm.kill_object(reshape905) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias3) gv1607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc951: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1607, R.dtype("float16")) cls.add(alloc943, alloc950, alloc951) R.vm.kill_object(alloc943) R.vm.kill_object(alloc950) model_decoder_layers_19_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[962] gv1608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1608, R.dtype("float16")) cls.layer_norm(alloc951, model_decoder_layers_19_encoder_attn_layer_norm_weight3, model_decoder_layers_19_encoder_attn_layer_norm_bias3, alloc952) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias3) model_decoder_layers_19_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[958] gv1609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1609, R.dtype("float16")) _951: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight3, alloc952, model_decoder_layers_19_encoder_attn_q_proj_bias3, alloc953) R.vm.kill_object(alloc952) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias3) gv1610: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape906: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc953, gv1610, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc953) gv1611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape907: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape906, gv1611, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape906) gv1612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1612, R.dtype("float16")) _952: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape907, alloc954) R.vm.kill_object(reshape907) gv1613: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape908: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc954, gv1613, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc954) 
gv1614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape909: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape908, gv1614, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape908) model_decoder_layers_19_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[960] gv1615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1615, R.dtype("float16")) _953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight3, reshape909, model_decoder_layers_19_encoder_attn_out_proj_bias3, alloc955) R.vm.kill_object(reshape909) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias3) gv1616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1616, R.dtype("float16")) cls.add(alloc951, alloc955, alloc956) R.vm.kill_object(alloc951) R.vm.kill_object(alloc955) model_decoder_layers_19_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[967] 
model_decoder_layers_19_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[968] gv1617: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1617, R.dtype("float16")) cls.layer_norm(alloc956, model_decoder_layers_19_final_layer_norm_weight3, model_decoder_layers_19_final_layer_norm_bias3, alloc957) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias3) model_decoder_layers_19_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[964] gv1618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1618, R.dtype("float16")) _956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_19_fc1_weight3, alloc957, model_decoder_layers_19_fc1_bias3, alloc958) R.vm.kill_object(alloc957) R.vm.kill_object(model_decoder_layers_19_fc1_weight3) R.vm.kill_object(model_decoder_layers_19_fc1_bias3) model_decoder_layers_19_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[966] gv1619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1619, R.dtype("float16")) _957: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_19_fc2_weight3, alloc958, model_decoder_layers_19_fc2_bias3, alloc959) R.vm.kill_object(alloc958) R.vm.kill_object(model_decoder_layers_19_fc2_weight3) R.vm.kill_object(model_decoder_layers_19_fc2_bias3) gv1620: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1620, R.dtype("float16")) cls.add(alloc956, alloc959, alloc960) R.vm.kill_object(alloc956) R.vm.kill_object(alloc959) model_decoder_layers_20_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[977] gv1621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1621, R.dtype("float16")) cls.layer_norm(alloc960, model_decoder_layers_20_self_attn_layer_norm_weight3, model_decoder_layers_20_self_attn_layer_norm_bias3, alloc961) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias3) model_decoder_layers_20_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[973] gv1622: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1622, R.dtype("float16")) _960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_q_proj_weight3, alloc961, model_decoder_layers_20_self_attn_q_proj_bias3, alloc962) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias3) gv1623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape910: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc962, gv1623, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc962) model_decoder_layers_20_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv1624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1624, R.dtype("float16")) _961: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_20_self_attn_k_proj_weight3, alloc961, alloc963) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight3) gv1625: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape911: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc963, gv1625, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc963) model_decoder_layers_20_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[971] gv1626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1626, R.dtype("float16")) _962: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_v_proj_weight3, alloc961, model_decoder_layers_20_self_attn_v_proj_bias3, alloc964) R.vm.kill_object(alloc961) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias3) gv1627: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape912: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc964, gv1627, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc964) gv1628: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc965: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1628, R.dtype("float16")) cls.concatenate(reshape910, reshape911, reshape912, alloc965) R.vm.kill_object(reshape910) R.vm.kill_object(reshape911) R.vm.kill_object(reshape912) gv1629: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape913: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc965, gv1629, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc965) gv1630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1630, R.dtype("float16")) _964: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape913, alloc966) R.vm.kill_object(reshape913) gv1631: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape914: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc966, gv1631, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc966) gv1632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape915: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape914, gv1632, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape914) model_decoder_layers_20_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[975] gv1633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1633, R.dtype("float16")) _965: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_out_proj_weight3, reshape915, model_decoder_layers_20_self_attn_out_proj_bias3, alloc967) R.vm.kill_object(reshape915) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias3) gv1634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1634, R.dtype("float16")) cls.add(alloc960, alloc967, alloc968) R.vm.kill_object(alloc960) R.vm.kill_object(alloc967) model_decoder_layers_20_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[986] gv1635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1635, R.dtype("float16")) cls.layer_norm(alloc968, model_decoder_layers_20_encoder_attn_layer_norm_weight3, model_decoder_layers_20_encoder_attn_layer_norm_bias3, alloc969) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias3) model_decoder_layers_20_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[982] gv1636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1636, R.dtype("float16")) _968: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight3, alloc969, model_decoder_layers_20_encoder_attn_q_proj_bias3, alloc970) R.vm.kill_object(alloc969) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias3) gv1637: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape916: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc970, gv1637, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc970) gv1638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape917: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape916, gv1638, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape916) gv1639: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1639, R.dtype("float16")) _969: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape917, alloc971) R.vm.kill_object(reshape917) gv1640: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape918: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc971, gv1640, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc971) gv1641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape919: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape918, gv1641, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape918) model_decoder_layers_20_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[984] 
gv1642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1642, R.dtype("float16")) _970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight3, reshape919, model_decoder_layers_20_encoder_attn_out_proj_bias3, alloc972) R.vm.kill_object(reshape919) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias3) gv1643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1643, R.dtype("float16")) cls.add(alloc968, alloc972, alloc973) R.vm.kill_object(alloc968) R.vm.kill_object(alloc972) model_decoder_layers_20_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[992] gv1644: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1644, R.dtype("float16")) cls.layer_norm(alloc973, model_decoder_layers_20_final_layer_norm_weight3, model_decoder_layers_20_final_layer_norm_bias3, alloc974) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias3) model_decoder_layers_20_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[988] gv1645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1645, R.dtype("float16")) _973: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_20_fc1_weight3, alloc974, model_decoder_layers_20_fc1_bias3, alloc975) R.vm.kill_object(alloc974) R.vm.kill_object(model_decoder_layers_20_fc1_weight3) R.vm.kill_object(model_decoder_layers_20_fc1_bias3) model_decoder_layers_20_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[990] gv1646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc976: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1646, R.dtype("float16")) _974: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_20_fc2_weight3, alloc975, model_decoder_layers_20_fc2_bias3, alloc976) R.vm.kill_object(alloc975) R.vm.kill_object(model_decoder_layers_20_fc2_weight3) R.vm.kill_object(model_decoder_layers_20_fc2_bias3) gv1647: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc977: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1647, R.dtype("float16")) cls.add(alloc973, alloc976, alloc977) R.vm.kill_object(alloc973) R.vm.kill_object(alloc976) model_decoder_layers_21_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv1648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc978: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1648, R.dtype("float16")) cls.layer_norm(alloc977, model_decoder_layers_21_self_attn_layer_norm_weight3, model_decoder_layers_21_self_attn_layer_norm_bias3, alloc978) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias3) model_decoder_layers_21_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[997] gv1649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc979: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1649, R.dtype("float16")) _977: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_q_proj_weight3, alloc978, model_decoder_layers_21_self_attn_q_proj_bias3, alloc979) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias3) 
gv1650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape920: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc979, gv1650, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc979) model_decoder_layers_21_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv1651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc980: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1651, R.dtype("float16")) _978: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_21_self_attn_k_proj_weight3, alloc978, alloc980) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight3) gv1652: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape921: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc980, gv1652, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc980) model_decoder_layers_21_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[995] gv1653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc981: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1653, R.dtype("float16")) _979: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_v_proj_weight3, alloc978, model_decoder_layers_21_self_attn_v_proj_bias3, alloc981) R.vm.kill_object(alloc978) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias3) gv1654: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape922: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc981, gv1654, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc981) gv1655: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc982: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1655, R.dtype("float16")) cls.concatenate(reshape920, reshape921, reshape922, alloc982) R.vm.kill_object(reshape920) R.vm.kill_object(reshape921) R.vm.kill_object(reshape922) gv1656: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape923: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc982, gv1656, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc982) gv1657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc983: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1657, R.dtype("float16")) _981: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape923, alloc983) R.vm.kill_object(reshape923) gv1658: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape924: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc983, gv1658, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc983) gv1659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape925: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape924, gv1659, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape924) model_decoder_layers_21_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[999] gv1660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc984: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1660, R.dtype("float16")) _982: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_out_proj_weight3, reshape925, model_decoder_layers_21_self_attn_out_proj_bias3, alloc984) R.vm.kill_object(reshape925) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias3) gv1661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1661, R.dtype("float16")) cls.add(alloc977, alloc984, alloc985) R.vm.kill_object(alloc977) R.vm.kill_object(alloc984) model_decoder_layers_21_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv1662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc986: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1662, R.dtype("float16")) cls.layer_norm(alloc985, model_decoder_layers_21_encoder_attn_layer_norm_weight3, model_decoder_layers_21_encoder_attn_layer_norm_bias3, alloc986) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias3) model_decoder_layers_21_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv1663: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1663, R.dtype("float16")) _985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight3, alloc986, model_decoder_layers_21_encoder_attn_q_proj_bias3, alloc987) R.vm.kill_object(alloc986) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias3) gv1664: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape926: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc987, gv1664, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc987) gv1665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape927: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape926, gv1665, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape926) gv1666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1666, R.dtype("float16")) _986: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape927, alloc988) R.vm.kill_object(reshape927) gv1667: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape928: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc988, gv1667, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc988) gv1668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape929: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape928, gv1668, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape928) model_decoder_layers_21_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv1669: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc989: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1669, R.dtype("float16")) _987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight3, reshape929, model_decoder_layers_21_encoder_attn_out_proj_bias3, alloc989) R.vm.kill_object(reshape929) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias3) gv1670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1670, R.dtype("float16")) cls.add(alloc985, alloc989, alloc990) R.vm.kill_object(alloc985) R.vm.kill_object(alloc989) model_decoder_layers_21_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv1671: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc991: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1671, R.dtype("float16")) cls.layer_norm(alloc990, model_decoder_layers_21_final_layer_norm_weight3, model_decoder_layers_21_final_layer_norm_bias3, alloc991) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias3) model_decoder_layers_21_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv1672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1672, R.dtype("float16")) _990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_21_fc1_weight3, alloc991, model_decoder_layers_21_fc1_bias3, alloc992) R.vm.kill_object(alloc991) R.vm.kill_object(model_decoder_layers_21_fc1_weight3) R.vm.kill_object(model_decoder_layers_21_fc1_bias3) model_decoder_layers_21_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv1673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1673, R.dtype("float16")) _991: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_21_fc2_weight3, alloc992, model_decoder_layers_21_fc2_bias3, alloc993) R.vm.kill_object(alloc992) R.vm.kill_object(model_decoder_layers_21_fc2_weight3) R.vm.kill_object(model_decoder_layers_21_fc2_bias3) gv1674: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1674, R.dtype("float16")) cls.add(alloc990, alloc993, alloc994) R.vm.kill_object(alloc990) R.vm.kill_object(alloc993) model_decoder_layers_22_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv1675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc995: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1675, R.dtype("float16")) cls.layer_norm(alloc994, model_decoder_layers_22_self_attn_layer_norm_weight3, model_decoder_layers_22_self_attn_layer_norm_bias3, alloc995) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias3) model_decoder_layers_22_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv1676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1676, R.dtype("float16")) _994: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_q_proj_weight3, alloc995, model_decoder_layers_22_self_attn_q_proj_bias3, alloc996) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias3) gv1677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape930: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc996, gv1677, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc996) model_decoder_layers_22_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv1678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1678, R.dtype("float16")) _995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_22_self_attn_k_proj_weight3, alloc995, alloc997) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight3) gv1679: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape931: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc997, gv1679, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc997) model_decoder_layers_22_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv1680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1680, R.dtype("float16")) _996: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_v_proj_weight3, alloc995, model_decoder_layers_22_self_attn_v_proj_bias3, alloc998) R.vm.kill_object(alloc995) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias3) gv1681: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape932: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc998, gv1681, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc998) gv1682: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc999: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1682, R.dtype("float16")) cls.concatenate(reshape930, reshape931, reshape932, alloc999) R.vm.kill_object(reshape930) R.vm.kill_object(reshape931) R.vm.kill_object(reshape932) gv1683: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape933: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc999, gv1683, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc999) gv1684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1684, R.dtype("float16")) _998: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape933, alloc1000) R.vm.kill_object(reshape933) gv1685: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape934: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1000, gv1685, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1000) gv1686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape935: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape934, gv1686, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape934) model_decoder_layers_22_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv1687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1687, R.dtype("float16")) _999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_out_proj_weight3, reshape935, model_decoder_layers_22_self_attn_out_proj_bias3, alloc1001) R.vm.kill_object(reshape935) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias3) gv1688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1002: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1688, R.dtype("float16")) cls.add(alloc994, alloc1001, alloc1002) R.vm.kill_object(alloc994) R.vm.kill_object(alloc1001) model_decoder_layers_22_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv1689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1689, R.dtype("float16")) cls.layer_norm(alloc1002, model_decoder_layers_22_encoder_attn_layer_norm_weight3, model_decoder_layers_22_encoder_attn_layer_norm_bias3, alloc1003) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias3) model_decoder_layers_22_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv1690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1690, R.dtype("float16")) _1002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight3, alloc1003, model_decoder_layers_22_encoder_attn_q_proj_bias3, alloc1004) R.vm.kill_object(alloc1003) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias3) gv1691: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape936: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1004, gv1691, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1004) gv1692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape937: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape936, gv1692, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape936) gv1693: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1693, R.dtype("float16")) _1003: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape937, alloc1005) R.vm.kill_object(reshape937) gv1694: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape938: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1005, gv1694, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1005) gv1695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape939: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape938, gv1695, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape938) model_decoder_layers_22_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv1696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1006: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1696, R.dtype("float16")) _1004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight3, reshape939, model_decoder_layers_22_encoder_attn_out_proj_bias3, alloc1006) R.vm.kill_object(reshape939) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias3) gv1697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1697, R.dtype("float16")) cls.add(alloc1002, alloc1006, alloc1007) R.vm.kill_object(alloc1002) R.vm.kill_object(alloc1006) model_decoder_layers_22_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1039] 
model_decoder_layers_22_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv1698: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1698, R.dtype("float16")) cls.layer_norm(alloc1007, model_decoder_layers_22_final_layer_norm_weight3, model_decoder_layers_22_final_layer_norm_bias3, alloc1008) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias3) model_decoder_layers_22_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv1699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1009: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1699, R.dtype("float16")) _1007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_22_fc1_weight3, alloc1008, model_decoder_layers_22_fc1_bias3, alloc1009) R.vm.kill_object(alloc1008) R.vm.kill_object(model_decoder_layers_22_fc1_weight3) R.vm.kill_object(model_decoder_layers_22_fc1_bias3) model_decoder_layers_22_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv1700: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1700, R.dtype("float16")) _1008: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_22_fc2_weight3, alloc1009, model_decoder_layers_22_fc2_bias3, alloc1010) R.vm.kill_object(alloc1009) R.vm.kill_object(model_decoder_layers_22_fc2_weight3) R.vm.kill_object(model_decoder_layers_22_fc2_bias3) gv1701: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1701, R.dtype("float16")) cls.add(alloc1007, alloc1010, alloc1011) R.vm.kill_object(alloc1007) R.vm.kill_object(alloc1010) model_decoder_layers_23_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1049] gv1702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1702, R.dtype("float16")) cls.layer_norm(alloc1011, model_decoder_layers_23_self_attn_layer_norm_weight3, model_decoder_layers_23_self_attn_layer_norm_bias3, alloc1012) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias3) model_decoder_layers_23_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv1703: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1703, R.dtype("float16")) _1011: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_q_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_q_proj_bias3, alloc1013) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias3) gv1704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape940: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1013, gv1704, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1013) model_decoder_layers_23_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv1705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1705, R.dtype("float16")) _1012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_23_self_attn_k_proj_weight3, alloc1012, alloc1014) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight3) gv1706: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape941: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1014, gv1706, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1014) model_decoder_layers_23_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv1707: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1015: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1707, R.dtype("float16")) _1013: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_v_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_v_proj_bias3, alloc1015) R.vm.kill_object(alloc1012) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias3) gv1708: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape942: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1015, gv1708, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1015) gv1709: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc1016: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1709, R.dtype("float16")) cls.concatenate(reshape940, reshape941, reshape942, alloc1016) R.vm.kill_object(reshape940) R.vm.kill_object(reshape941) R.vm.kill_object(reshape942) gv1710: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape943: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1016, gv1710, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1016) gv1711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1711, R.dtype("float16")) _1015: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape943, alloc1017) R.vm.kill_object(reshape943) gv1712: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape944: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1017, gv1712, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1017) gv1713: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape945: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape944, gv1713, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape944) model_decoder_layers_23_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv1714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1714, R.dtype("float16")) _1016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_out_proj_weight3, reshape945, model_decoder_layers_23_self_attn_out_proj_bias3, alloc1018) R.vm.kill_object(reshape945) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias3) gv1715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1715, R.dtype("float16")) cls.add(alloc1011, alloc1018, alloc1019) R.vm.kill_object(alloc1011) R.vm.kill_object(alloc1018) model_decoder_layers_23_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv1716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1716, R.dtype("float16")) cls.layer_norm(alloc1019, model_decoder_layers_23_encoder_attn_layer_norm_weight3, model_decoder_layers_23_encoder_attn_layer_norm_bias3, alloc1020) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias3) model_decoder_layers_23_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv1717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1717, R.dtype("float16")) _1019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight3, alloc1020, model_decoder_layers_23_encoder_attn_q_proj_bias3, alloc1021) R.vm.kill_object(alloc1020) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias3) gv1718: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape946: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1021, gv1718, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1021) gv1719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape947: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape946, gv1719, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape946) gv1720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1720, R.dtype("float16")) _1020: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape947, alloc1022) R.vm.kill_object(reshape947) gv1721: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape948: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1022, gv1721, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1022) gv1722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape949: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape948, gv1722, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape948) model_decoder_layers_23_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] 
model_decoder_layers_23_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1056] gv1723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1023: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1723, R.dtype("float16")) _1021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight3, reshape949, model_decoder_layers_23_encoder_attn_out_proj_bias3, alloc1023) R.vm.kill_object(reshape949) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias3) gv1724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1724, R.dtype("float16")) cls.add(alloc1019, alloc1023, alloc1024) R.vm.kill_object(alloc1019) R.vm.kill_object(alloc1023) model_decoder_layers_23_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv1725: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1725, R.dtype("float16")) cls.layer_norm(alloc1024, model_decoder_layers_23_final_layer_norm_weight3, model_decoder_layers_23_final_layer_norm_bias3, alloc1025) 
R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias3) model_decoder_layers_23_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv1726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1726, R.dtype("float16")) _1024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_23_fc1_weight3, alloc1025, model_decoder_layers_23_fc1_bias3, alloc1026) R.vm.kill_object(alloc1025) R.vm.kill_object(model_decoder_layers_23_fc1_weight3) R.vm.kill_object(model_decoder_layers_23_fc1_bias3) model_decoder_layers_23_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv1727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1727, R.dtype("float16")) _1025: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_23_fc2_weight3, alloc1026, model_decoder_layers_23_fc2_bias3, alloc1027) R.vm.kill_object(alloc1026) R.vm.kill_object(model_decoder_layers_23_fc2_weight3) R.vm.kill_object(model_decoder_layers_23_fc2_bias3) gv1728: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1728, R.dtype("float16")) cls.add(alloc1024, alloc1027, alloc1028) R.vm.kill_object(alloc1024) R.vm.kill_object(alloc1027) model_decoder_layers_24_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv1729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1729, R.dtype("float16")) cls.layer_norm(alloc1028, model_decoder_layers_24_self_attn_layer_norm_weight3, model_decoder_layers_24_self_attn_layer_norm_bias3, alloc1029) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias3) model_decoder_layers_24_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv1730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1030: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1730, R.dtype("float16")) _1028: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_q_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_q_proj_bias3, alloc1030) 
R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias3) gv1731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape950: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1030, gv1731, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1030) model_decoder_layers_24_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv1732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1732, R.dtype("float16")) _1029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_24_self_attn_k_proj_weight3, alloc1029, alloc1031) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight3) gv1733: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape951: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1031, gv1733, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1031) model_decoder_layers_24_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv1734: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1734, R.dtype("float16")) _1030: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_v_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_v_proj_bias3, alloc1032) R.vm.kill_object(alloc1029) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias3) gv1735: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape952: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1032, gv1735, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1032) gv1736: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1033: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1736, R.dtype("float16")) cls.concatenate(reshape950, reshape951, reshape952, alloc1033) R.vm.kill_object(reshape950) R.vm.kill_object(reshape951) R.vm.kill_object(reshape952) gv1737: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape953: R.Tensor((batch_size, 60, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1033, gv1737, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1033) gv1738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1738, R.dtype("float16")) _1032: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape953, alloc1034) R.vm.kill_object(reshape953) gv1739: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape954: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1034, gv1739, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1034) gv1740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape955: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape954, gv1740, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape954) model_decoder_layers_24_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv1741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1741, R.dtype("float16")) _1033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_out_proj_weight3, reshape955, model_decoder_layers_24_self_attn_out_proj_bias3, alloc1035) R.vm.kill_object(reshape955) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias3) gv1742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1742, R.dtype("float16")) cls.add(alloc1028, alloc1035, alloc1036) R.vm.kill_object(alloc1028) R.vm.kill_object(alloc1035) model_decoder_layers_24_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv1743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1743, R.dtype("float16")) cls.layer_norm(alloc1036, model_decoder_layers_24_encoder_attn_layer_norm_weight3, model_decoder_layers_24_encoder_attn_layer_norm_bias3, alloc1037) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias3) model_decoder_layers_24_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv1744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1038: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1744, R.dtype("float16")) _1036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight3, alloc1037, model_decoder_layers_24_encoder_attn_q_proj_bias3, alloc1038) R.vm.kill_object(alloc1037) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias3) gv1745: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape956: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1038, gv1745, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1038) gv1746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape957: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape956, gv1746, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape956) gv1747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc1039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1747, R.dtype("float16")) _1037: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape957, alloc1039) R.vm.kill_object(reshape957) gv1748: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape958: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1039, gv1748, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1039) gv1749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape959: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape958, gv1749, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape958) model_decoder_layers_24_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1080] gv1750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1040: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1750, R.dtype("float16")) _1038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight3, 
reshape959, model_decoder_layers_24_encoder_attn_out_proj_bias3, alloc1040) R.vm.kill_object(reshape959) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias3) gv1751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1751, R.dtype("float16")) cls.add(alloc1036, alloc1040, alloc1041) R.vm.kill_object(alloc1036) R.vm.kill_object(alloc1040) model_decoder_layers_24_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv1752: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1752, R.dtype("float16")) cls.layer_norm(alloc1041, model_decoder_layers_24_final_layer_norm_weight3, model_decoder_layers_24_final_layer_norm_bias3, alloc1042) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias3) model_decoder_layers_24_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv1753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1043: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage13, R.prim_value(0), gv1753, R.dtype("float16")) _1041: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_24_fc1_weight3, alloc1042, model_decoder_layers_24_fc1_bias3, alloc1043) R.vm.kill_object(alloc1042) R.vm.kill_object(model_decoder_layers_24_fc1_weight3) R.vm.kill_object(model_decoder_layers_24_fc1_bias3) model_decoder_layers_24_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv1754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1754, R.dtype("float16")) _1042: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_24_fc2_weight3, alloc1043, model_decoder_layers_24_fc2_bias3, alloc1044) R.vm.kill_object(alloc1043) R.vm.kill_object(model_decoder_layers_24_fc2_weight3) R.vm.kill_object(model_decoder_layers_24_fc2_bias3) gv1755: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1755, R.dtype("float16")) cls.add(alloc1041, alloc1044, alloc1045) R.vm.kill_object(alloc1041) R.vm.kill_object(alloc1044) model_decoder_layers_25_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv1756: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1756, R.dtype("float16")) cls.layer_norm(alloc1045, model_decoder_layers_25_self_attn_layer_norm_weight3, model_decoder_layers_25_self_attn_layer_norm_bias3, alloc1046) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias3) model_decoder_layers_25_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv1757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1757, R.dtype("float16")) _1045: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_q_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_q_proj_bias3, alloc1047) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias3) gv1758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape960: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1047, gv1758, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1047) 
model_decoder_layers_25_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv1759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1759, R.dtype("float16")) _1046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_25_self_attn_k_proj_weight3, alloc1046, alloc1048) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight3) gv1760: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape961: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1048, gv1760, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1048) model_decoder_layers_25_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv1761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1761, R.dtype("float16")) _1047: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_v_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_v_proj_bias3, alloc1049) R.vm.kill_object(alloc1046) 
R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias3) gv1762: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape962: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1049, gv1762, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1049) gv1763: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1050: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1763, R.dtype("float16")) cls.concatenate(reshape960, reshape961, reshape962, alloc1050) R.vm.kill_object(reshape960) R.vm.kill_object(reshape961) R.vm.kill_object(reshape962) gv1764: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape963: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1050, gv1764, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1050) gv1765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1765, R.dtype("float16")) _1049: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape963, alloc1051) R.vm.kill_object(reshape963) gv1766: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape964: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1051, gv1766, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1051) gv1767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape965: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape964, gv1767, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape964) model_decoder_layers_25_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv1768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1768, R.dtype("float16")) _1050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_out_proj_weight3, reshape965, model_decoder_layers_25_self_attn_out_proj_bias3, alloc1052) R.vm.kill_object(reshape965) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias3) gv1769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1769, R.dtype("float16")) cls.add(alloc1045, alloc1052, alloc1053) R.vm.kill_object(alloc1045) R.vm.kill_object(alloc1052) model_decoder_layers_25_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv1770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1770, R.dtype("float16")) cls.layer_norm(alloc1053, model_decoder_layers_25_encoder_attn_layer_norm_weight3, model_decoder_layers_25_encoder_attn_layer_norm_bias3, alloc1054) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias3) model_decoder_layers_25_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv1771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1771, R.dtype("float16")) _1053: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight3, alloc1054, model_decoder_layers_25_encoder_attn_q_proj_bias3, alloc1055) R.vm.kill_object(alloc1054) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias3) gv1772: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape966: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1055, gv1772, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1055) gv1773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape967: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape966, gv1773, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape966) gv1774: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1774, R.dtype("float16")) _1054: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape967, alloc1056) R.vm.kill_object(reshape967) gv1775: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape968: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1056, gv1775, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1056) gv1776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape969: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape968, gv1776, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape968) model_decoder_layers_25_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv1777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1057: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1777, R.dtype("float16")) _1055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight3, reshape969, model_decoder_layers_25_encoder_attn_out_proj_bias3, alloc1057) R.vm.kill_object(reshape969) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias3) gv1778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1058: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1778, R.dtype("float16")) cls.add(alloc1053, alloc1057, alloc1058) R.vm.kill_object(alloc1053) R.vm.kill_object(alloc1057) model_decoder_layers_25_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv1779: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1779, R.dtype("float16")) cls.layer_norm(alloc1058, model_decoder_layers_25_final_layer_norm_weight3, model_decoder_layers_25_final_layer_norm_bias3, alloc1059) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias3) model_decoder_layers_25_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv1780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1780, R.dtype("float16")) _1058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_25_fc1_weight3, alloc1059, model_decoder_layers_25_fc1_bias3, alloc1060) R.vm.kill_object(alloc1059) R.vm.kill_object(model_decoder_layers_25_fc1_weight3) R.vm.kill_object(model_decoder_layers_25_fc1_bias3) model_decoder_layers_25_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] 
model_decoder_layers_25_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv1781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1781, R.dtype("float16")) _1059: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_25_fc2_weight3, alloc1060, model_decoder_layers_25_fc2_bias3, alloc1061) R.vm.kill_object(alloc1060) R.vm.kill_object(model_decoder_layers_25_fc2_weight3) R.vm.kill_object(model_decoder_layers_25_fc2_bias3) gv1782: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1782, R.dtype("float16")) cls.add(alloc1058, alloc1061, alloc1062) R.vm.kill_object(alloc1058) R.vm.kill_object(alloc1061) model_decoder_layers_26_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv1783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1783, R.dtype("float16")) cls.layer_norm(alloc1062, model_decoder_layers_26_self_attn_layer_norm_weight3, model_decoder_layers_26_self_attn_layer_norm_bias3, alloc1063) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias3) model_decoder_layers_26_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv1784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1784, R.dtype("float16")) _1062: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_q_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_q_proj_bias3, alloc1064) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias3) gv1785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape970: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1064, gv1785, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1064) model_decoder_layers_26_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv1786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1786, R.dtype("float16")) _1063: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_26_self_attn_k_proj_weight3, alloc1063, alloc1065) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight3) gv1787: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape971: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1065, gv1787, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1065) model_decoder_layers_26_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv1788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1788, R.dtype("float16")) _1064: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_v_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_v_proj_bias3, alloc1066) R.vm.kill_object(alloc1063) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias3) gv1789: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape972: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1066, 
gv1789, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1066) gv1790: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1067: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1790, R.dtype("float16")) cls.concatenate(reshape970, reshape971, reshape972, alloc1067) R.vm.kill_object(reshape970) R.vm.kill_object(reshape971) R.vm.kill_object(reshape972) gv1791: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape973: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1067, gv1791, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1067) gv1792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1792, R.dtype("float16")) _1066: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape973, alloc1068) R.vm.kill_object(reshape973) gv1793: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape974: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1068, gv1793, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1068) gv1794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape975: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape974, gv1794, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape974) model_decoder_layers_26_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv1795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1795, R.dtype("float16")) _1067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_out_proj_weight3, reshape975, model_decoder_layers_26_self_attn_out_proj_bias3, alloc1069) R.vm.kill_object(reshape975) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias3) gv1796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1796, R.dtype("float16")) cls.add(alloc1062, alloc1069, alloc1070) R.vm.kill_object(alloc1062) R.vm.kill_object(alloc1069) 
model_decoder_layers_26_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv1797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1797, R.dtype("float16")) cls.layer_norm(alloc1070, model_decoder_layers_26_encoder_attn_layer_norm_weight3, model_decoder_layers_26_encoder_attn_layer_norm_bias3, alloc1071) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias3) model_decoder_layers_26_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv1798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1798, R.dtype("float16")) _1070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight3, alloc1071, model_decoder_layers_26_encoder_attn_q_proj_bias3, alloc1072) R.vm.kill_object(alloc1071) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias3) gv1799: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape976: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1072, gv1799, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1072) gv1800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape977: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape976, gv1800, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape976) gv1801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1801, R.dtype("float16")) _1071: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape977, alloc1073) R.vm.kill_object(reshape977) gv1802: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape978: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1073, gv1802, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1073) gv1803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape979: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape978, gv1803, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape978) model_decoder_layers_26_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv1804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1074: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1804, R.dtype("float16")) _1072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight3, reshape979, model_decoder_layers_26_encoder_attn_out_proj_bias3, alloc1074) R.vm.kill_object(reshape979) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias3) gv1805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1805, R.dtype("float16")) cls.add(alloc1070, alloc1074, alloc1075) R.vm.kill_object(alloc1070) R.vm.kill_object(alloc1074) model_decoder_layers_26_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv1806: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1806, R.dtype("float16")) cls.layer_norm(alloc1075, model_decoder_layers_26_final_layer_norm_weight3, model_decoder_layers_26_final_layer_norm_bias3, alloc1076) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias3) model_decoder_layers_26_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv1807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1807, R.dtype("float16")) _1075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_26_fc1_weight3, alloc1076, model_decoder_layers_26_fc1_bias3, alloc1077) R.vm.kill_object(alloc1076) R.vm.kill_object(model_decoder_layers_26_fc1_weight3) R.vm.kill_object(model_decoder_layers_26_fc1_bias3) model_decoder_layers_26_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv1808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1808, R.dtype("float16")) _1076: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_26_fc2_weight3, alloc1077, model_decoder_layers_26_fc2_bias3, alloc1078) R.vm.kill_object(alloc1077) R.vm.kill_object(model_decoder_layers_26_fc2_weight3) R.vm.kill_object(model_decoder_layers_26_fc2_bias3) gv1809: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1809, R.dtype("float16")) cls.add(alloc1075, alloc1078, alloc1079) R.vm.kill_object(alloc1075) R.vm.kill_object(alloc1078) model_decoder_layers_27_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv1810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1810, R.dtype("float16")) cls.layer_norm(alloc1079, model_decoder_layers_27_self_attn_layer_norm_weight3, model_decoder_layers_27_self_attn_layer_norm_bias3, alloc1080) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias3) model_decoder_layers_27_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv1811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1081: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1811, R.dtype("float16")) _1079: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_q_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_q_proj_bias3, alloc1081) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias3) gv1812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape980: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1081, gv1812, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1081) model_decoder_layers_27_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv1813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1813, R.dtype("float16")) _1080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_27_self_attn_k_proj_weight3, alloc1080, alloc1082) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight3) gv1814: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape981: R.Tensor((batch_size, 1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1082, gv1814, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1082) model_decoder_layers_27_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv1815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1815, R.dtype("float16")) _1081: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_v_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_v_proj_bias3, alloc1083) R.vm.kill_object(alloc1080) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias3) gv1816: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape982: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1083, gv1816, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1083) gv1817: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1084: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1817, R.dtype("float16")) cls.concatenate(reshape980, 
reshape981, reshape982, alloc1084) R.vm.kill_object(reshape980) R.vm.kill_object(reshape981) R.vm.kill_object(reshape982) gv1818: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape983: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1084, gv1818, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1084) gv1819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1819, R.dtype("float16")) _1083: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape983, alloc1085) R.vm.kill_object(reshape983) gv1820: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape984: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1085, gv1820, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1085) gv1821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape985: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape984, gv1821, sinfo_args=(R.Tensor((batch_size, 1, 1280), 
dtype="float16"),)) R.vm.kill_object(reshape984) model_decoder_layers_27_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv1822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1822, R.dtype("float16")) _1084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_out_proj_weight3, reshape985, model_decoder_layers_27_self_attn_out_proj_bias3, alloc1086) R.vm.kill_object(reshape985) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias3) gv1823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1823, R.dtype("float16")) cls.add(alloc1079, alloc1086, alloc1087) R.vm.kill_object(alloc1079) R.vm.kill_object(alloc1086) model_decoder_layers_27_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv1824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), 
gv1824, R.dtype("float16")) cls.layer_norm(alloc1087, model_decoder_layers_27_encoder_attn_layer_norm_weight3, model_decoder_layers_27_encoder_attn_layer_norm_bias3, alloc1088) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias3) model_decoder_layers_27_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv1825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1825, R.dtype("float16")) _1087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight3, alloc1088, model_decoder_layers_27_encoder_attn_q_proj_bias3, alloc1089) R.vm.kill_object(alloc1088) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias3) gv1826: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape986: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1089, gv1826, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1089) gv1827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape987: 
R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape986, gv1827, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape986) gv1828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1828, R.dtype("float16")) _1088: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape987, alloc1090) R.vm.kill_object(reshape987) gv1829: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape988: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1090, gv1829, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1090) gv1830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape989: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape988, gv1830, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape988) model_decoder_layers_27_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv1831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1091: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1831, R.dtype("float16")) _1089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight3, reshape989, model_decoder_layers_27_encoder_attn_out_proj_bias3, alloc1091) R.vm.kill_object(reshape989) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias3) gv1832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1092: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1832, R.dtype("float16")) cls.add(alloc1087, alloc1091, alloc1092) R.vm.kill_object(alloc1087) R.vm.kill_object(alloc1091) model_decoder_layers_27_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv1833: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1833, R.dtype("float16")) cls.layer_norm(alloc1092, model_decoder_layers_27_final_layer_norm_weight3, model_decoder_layers_27_final_layer_norm_bias3, alloc1093) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias3) model_decoder_layers_27_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = 
packed_params[1155] model_decoder_layers_27_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1156] gv1834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1834, R.dtype("float16")) _1092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_27_fc1_weight3, alloc1093, model_decoder_layers_27_fc1_bias3, alloc1094) R.vm.kill_object(alloc1093) R.vm.kill_object(model_decoder_layers_27_fc1_weight3) R.vm.kill_object(model_decoder_layers_27_fc1_bias3) model_decoder_layers_27_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv1835: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1835, R.dtype("float16")) _1093: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_27_fc2_weight3, alloc1094, model_decoder_layers_27_fc2_bias3, alloc1095) R.vm.kill_object(alloc1094) R.vm.kill_object(model_decoder_layers_27_fc2_weight3) R.vm.kill_object(model_decoder_layers_27_fc2_bias3) gv1836: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1096: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1836, 
R.dtype("float16")) cls.add(alloc1092, alloc1095, alloc1096) R.vm.kill_object(alloc1092) R.vm.kill_object(alloc1095) model_decoder_layers_28_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1169] gv1837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1837, R.dtype("float16")) cls.layer_norm(alloc1096, model_decoder_layers_28_self_attn_layer_norm_weight3, model_decoder_layers_28_self_attn_layer_norm_bias3, alloc1097) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias3) model_decoder_layers_28_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv1838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1838, R.dtype("float16")) _1096: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_q_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_q_proj_bias3, alloc1098) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias3) gv1839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape990: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1098, gv1839, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1098) model_decoder_layers_28_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv1840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1840, R.dtype("float16")) _1097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_28_self_attn_k_proj_weight3, alloc1097, alloc1099) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight3) gv1841: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape991: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1099, gv1841, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1099) model_decoder_layers_28_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv1842: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1100: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1842, R.dtype("float16")) _1098: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_v_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_v_proj_bias3, alloc1100) R.vm.kill_object(alloc1097) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias3) gv1843: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape992: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1100, gv1843, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1100) gv1844: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1101: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1844, R.dtype("float16")) cls.concatenate(reshape990, reshape991, reshape992, alloc1101) R.vm.kill_object(reshape990) R.vm.kill_object(reshape991) R.vm.kill_object(reshape992) gv1845: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape993: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1101, gv1845, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1101) gv1846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1846, R.dtype("float16")) _1100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape993, alloc1102) R.vm.kill_object(reshape993) gv1847: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape994: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1102, gv1847, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1102) gv1848: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape995: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape994, gv1848, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape994) model_decoder_layers_28_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv1849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1849, R.dtype("float16")) _1101: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_out_proj_weight3, reshape995, model_decoder_layers_28_self_attn_out_proj_bias3, alloc1103) R.vm.kill_object(reshape995) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias3) gv1850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1850, R.dtype("float16")) cls.add(alloc1096, alloc1103, alloc1104) R.vm.kill_object(alloc1096) R.vm.kill_object(alloc1103) model_decoder_layers_28_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv1851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1851, R.dtype("float16")) cls.layer_norm(alloc1104, model_decoder_layers_28_encoder_attn_layer_norm_weight3, model_decoder_layers_28_encoder_attn_layer_norm_bias3, alloc1105) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias3) model_decoder_layers_28_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv1852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1852, R.dtype("float16")) _1104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight3, alloc1105, model_decoder_layers_28_encoder_attn_q_proj_bias3, alloc1106) R.vm.kill_object(alloc1105) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias3) gv1853: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape996: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1106, gv1853, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1106) gv1854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape997: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape996, gv1854, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape996) gv1855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1855, R.dtype("float16")) _1105: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape997, alloc1107) R.vm.kill_object(reshape997) gv1856: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape998: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1107, gv1856, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1107) gv1857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape999: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape998, gv1857, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape998) model_decoder_layers_28_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv1858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1858, R.dtype("float16")) _1106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight3, reshape999, model_decoder_layers_28_encoder_attn_out_proj_bias3, alloc1108) R.vm.kill_object(reshape999) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias3) gv1859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1859, R.dtype("float16")) cls.add(alloc1104, alloc1108, alloc1109) R.vm.kill_object(alloc1104) R.vm.kill_object(alloc1108) model_decoder_layers_28_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv1860: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1860, R.dtype("float16")) cls.layer_norm(alloc1109, model_decoder_layers_28_final_layer_norm_weight3, model_decoder_layers_28_final_layer_norm_bias3, alloc1110) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias3) model_decoder_layers_28_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv1861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1861, R.dtype("float16")) _1109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_28_fc1_weight3, alloc1110, model_decoder_layers_28_fc1_bias3, alloc1111) R.vm.kill_object(alloc1110) R.vm.kill_object(model_decoder_layers_28_fc1_weight3) R.vm.kill_object(model_decoder_layers_28_fc1_bias3) model_decoder_layers_28_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv1862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1862, R.dtype("float16")) _1110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_28_fc2_weight3, alloc1111, model_decoder_layers_28_fc2_bias3, alloc1112) R.vm.kill_object(alloc1111) R.vm.kill_object(model_decoder_layers_28_fc2_weight3) R.vm.kill_object(model_decoder_layers_28_fc2_bias3) gv1863: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1113: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1863, R.dtype("float16")) cls.add(alloc1109, alloc1112, alloc1113) R.vm.kill_object(alloc1109) R.vm.kill_object(alloc1112) model_decoder_layers_29_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv1864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1114: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1864, R.dtype("float16")) cls.layer_norm(alloc1113, model_decoder_layers_29_self_attn_layer_norm_weight3, model_decoder_layers_29_self_attn_layer_norm_bias3, alloc1114) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias3) model_decoder_layers_29_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv1865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1865, R.dtype("float16")) _1113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_q_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_q_proj_bias3, alloc1115) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias3) gv1866: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1000: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1115, gv1866, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1115) model_decoder_layers_29_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv1867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1867, R.dtype("float16")) _1114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_29_self_attn_k_proj_weight3, alloc1114, alloc1116) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight3) gv1868: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1001: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1116, gv1868, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1116) model_decoder_layers_29_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv1869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1869, R.dtype("float16")) _1115: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_v_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_v_proj_bias3, alloc1117) R.vm.kill_object(alloc1114) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias3) gv1870: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1002: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1117, gv1870, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1117) gv1871: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1118: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1871, R.dtype("float16")) cls.concatenate(reshape1000, reshape1001, reshape1002, alloc1118) R.vm.kill_object(reshape1000) R.vm.kill_object(reshape1001) R.vm.kill_object(reshape1002) gv1872: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1003: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1118, gv1872, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1118) gv1873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1873, R.dtype("float16")) _1117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1003, alloc1119) R.vm.kill_object(reshape1003) gv1874: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1004: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1119, gv1874, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1119) gv1875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1005: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1004, gv1875, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1004) model_decoder_layers_29_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv1876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1876, R.dtype("float16")) _1118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_out_proj_weight3, reshape1005, model_decoder_layers_29_self_attn_out_proj_bias3, alloc1120) R.vm.kill_object(reshape1005) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias3) gv1877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1877, R.dtype("float16")) cls.add(alloc1113, alloc1120, alloc1121) R.vm.kill_object(alloc1113) R.vm.kill_object(alloc1120) model_decoder_layers_29_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv1878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1878, R.dtype("float16")) cls.layer_norm(alloc1121, model_decoder_layers_29_encoder_attn_layer_norm_weight3, model_decoder_layers_29_encoder_attn_layer_norm_bias3, alloc1122) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias3) model_decoder_layers_29_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv1879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1879, R.dtype("float16")) _1121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight3, alloc1122, model_decoder_layers_29_encoder_attn_q_proj_bias3, alloc1123) R.vm.kill_object(alloc1122) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias3) gv1880: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1006: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1123, gv1880, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1123) gv1881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1007: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1006, gv1881, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1006) gv1882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1882, R.dtype("float16")) _1122: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1007, alloc1124) R.vm.kill_object(reshape1007) gv1883: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1008: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1124, gv1883, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1124) gv1884: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1009: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1008, gv1884, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1008) model_decoder_layers_29_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1200] gv1885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1885, R.dtype("float16")) _1123: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight3, reshape1009, model_decoder_layers_29_encoder_attn_out_proj_bias3, alloc1125) R.vm.kill_object(reshape1009) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias3) gv1886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1886, R.dtype("float16")) cls.add(alloc1121, alloc1125, alloc1126) R.vm.kill_object(alloc1121) R.vm.kill_object(alloc1125) 
model_decoder_layers_29_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv1887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1887, R.dtype("float16")) cls.layer_norm(alloc1126, model_decoder_layers_29_final_layer_norm_weight3, model_decoder_layers_29_final_layer_norm_bias3, alloc1127) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias3) model_decoder_layers_29_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv1888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1888, R.dtype("float16")) _1126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_29_fc1_weight3, alloc1127, model_decoder_layers_29_fc1_bias3, alloc1128) R.vm.kill_object(alloc1127) R.vm.kill_object(model_decoder_layers_29_fc1_weight3) R.vm.kill_object(model_decoder_layers_29_fc1_bias3) model_decoder_layers_29_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv1889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1889, R.dtype("float16")) _1127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_29_fc2_weight3, alloc1128, model_decoder_layers_29_fc2_bias3, alloc1129) R.vm.kill_object(alloc1128) R.vm.kill_object(model_decoder_layers_29_fc2_weight3) R.vm.kill_object(model_decoder_layers_29_fc2_bias3) gv1890: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1890, R.dtype("float16")) cls.add(alloc1126, alloc1129, alloc1130) R.vm.kill_object(alloc1126) R.vm.kill_object(alloc1129) model_decoder_layers_30_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv1891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1891, R.dtype("float16")) cls.layer_norm(alloc1130, model_decoder_layers_30_self_attn_layer_norm_weight3, model_decoder_layers_30_self_attn_layer_norm_bias3, alloc1131) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias3) model_decoder_layers_30_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] 
model_decoder_layers_30_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv1892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1892, R.dtype("float16")) _1130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_q_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_q_proj_bias3, alloc1132) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias3) gv1893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1010: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1132, gv1893, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1132) model_decoder_layers_30_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv1894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1894, R.dtype("float16")) _1131: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_30_self_attn_k_proj_weight3, alloc1131, alloc1133) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight3) gv1895: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1011: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1133, gv1895, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1133) model_decoder_layers_30_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv1896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1896, R.dtype("float16")) _1132: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_v_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_v_proj_bias3, alloc1134) R.vm.kill_object(alloc1131) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias3) gv1897: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1012: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1134, gv1897, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1134) gv1898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1135: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1898, R.dtype("float16")) cls.concatenate(reshape1010, reshape1011, reshape1012, alloc1135) R.vm.kill_object(reshape1010) R.vm.kill_object(reshape1011) R.vm.kill_object(reshape1012) gv1899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1013: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1135, gv1899, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1135) gv1900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1900, R.dtype("float16")) _1134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1013, alloc1136) R.vm.kill_object(reshape1013) gv1901: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1014: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1136, gv1901, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1136) gv1902: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1015: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1014, gv1902, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1014) model_decoder_layers_30_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv1903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1903, R.dtype("float16")) _1135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_out_proj_weight3, reshape1015, model_decoder_layers_30_self_attn_out_proj_bias3, alloc1137) R.vm.kill_object(reshape1015) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias3) gv1904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1904, R.dtype("float16")) cls.add(alloc1130, alloc1137, alloc1138) R.vm.kill_object(alloc1130) R.vm.kill_object(alloc1137) model_decoder_layers_30_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv1905: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1905, R.dtype("float16")) cls.layer_norm(alloc1138, model_decoder_layers_30_encoder_attn_layer_norm_weight3, model_decoder_layers_30_encoder_attn_layer_norm_bias3, alloc1139) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias3) model_decoder_layers_30_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv1906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1906, R.dtype("float16")) _1138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight3, alloc1139, model_decoder_layers_30_encoder_attn_q_proj_bias3, alloc1140) R.vm.kill_object(alloc1139) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias3) gv1907: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1016: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1140, gv1907, sinfo_args=(R.Tensor((batch_size, 1, 
20, 64), dtype="float16"),)) R.vm.kill_object(alloc1140) gv1908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1017: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1016, gv1908, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1016) gv1909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1909, R.dtype("float16")) _1139: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1017, alloc1141) R.vm.kill_object(reshape1017) gv1910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1018: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1141, gv1910, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1141) gv1911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1019: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1018, gv1911, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1018) 
model_decoder_layers_30_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv1912: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1912, R.dtype("float16")) _1140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight3, reshape1019, model_decoder_layers_30_encoder_attn_out_proj_bias3, alloc1142) R.vm.kill_object(reshape1019) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias3) gv1913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1913, R.dtype("float16")) cls.add(alloc1138, alloc1142, alloc1143) R.vm.kill_object(alloc1138) R.vm.kill_object(alloc1142) model_decoder_layers_30_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv1914: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1914, R.dtype("float16")) 
cls.layer_norm(alloc1143, model_decoder_layers_30_final_layer_norm_weight3, model_decoder_layers_30_final_layer_norm_bias3, alloc1144) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias3) model_decoder_layers_30_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv1915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1915, R.dtype("float16")) _1143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_30_fc1_weight3, alloc1144, model_decoder_layers_30_fc1_bias3, alloc1145) R.vm.kill_object(alloc1144) R.vm.kill_object(model_decoder_layers_30_fc1_weight3) R.vm.kill_object(model_decoder_layers_30_fc1_bias3) model_decoder_layers_30_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv1916: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1916, R.dtype("float16")) _1144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_30_fc2_weight3, alloc1145, model_decoder_layers_30_fc2_bias3, alloc1146) R.vm.kill_object(alloc1145) R.vm.kill_object(model_decoder_layers_30_fc2_weight3) 
R.vm.kill_object(model_decoder_layers_30_fc2_bias3) gv1917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1917, R.dtype("float16")) cls.add(alloc1143, alloc1146, alloc1147) R.vm.kill_object(alloc1143) R.vm.kill_object(alloc1146) model_decoder_layers_31_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv1918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1918, R.dtype("float16")) cls.layer_norm(alloc1147, model_decoder_layers_31_self_attn_layer_norm_weight3, model_decoder_layers_31_self_attn_layer_norm_bias3, alloc1148) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias3) model_decoder_layers_31_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv1919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1919, R.dtype("float16")) _1147: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_q_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_q_proj_bias3, alloc1149) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias3) gv1920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1020: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1149, gv1920, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1149) model_decoder_layers_31_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv1921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1921, R.dtype("float16")) _1148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_31_self_attn_k_proj_weight3, alloc1148, alloc1150) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight3) gv1922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1021: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1150, gv1922, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1150) 
model_decoder_layers_31_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv1923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1923, R.dtype("float16")) _1149: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_v_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_v_proj_bias3, alloc1151) R.vm.kill_object(alloc1148) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias3) gv1924: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1022: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1151, gv1924, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1151) gv1925: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1152: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1925, R.dtype("float16")) cls.concatenate(reshape1020, reshape1021, reshape1022, alloc1152) R.vm.kill_object(reshape1020) R.vm.kill_object(reshape1021) R.vm.kill_object(reshape1022) gv1926: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1023: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1152, gv1926, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1152) gv1927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1927, R.dtype("float16")) _1151: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1023, alloc1153) R.vm.kill_object(reshape1023) gv1928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1024: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1153, gv1928, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1153) gv1929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1025: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1024, gv1929, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1024) model_decoder_layers_31_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv1930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1930, R.dtype("float16")) _1152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_out_proj_weight3, reshape1025, model_decoder_layers_31_self_attn_out_proj_bias3, alloc1154) R.vm.kill_object(reshape1025) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias3) gv1931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1931, R.dtype("float16")) cls.add(alloc1147, alloc1154, alloc1155) R.vm.kill_object(alloc1147) R.vm.kill_object(alloc1154) model_decoder_layers_31_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv1932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1932, R.dtype("float16")) cls.layer_norm(alloc1155, model_decoder_layers_31_encoder_attn_layer_norm_weight3, 
model_decoder_layers_31_encoder_attn_layer_norm_bias3, alloc1156) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias3) model_decoder_layers_31_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv1933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1933, R.dtype("float16")) _1155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight3, alloc1156, model_decoder_layers_31_encoder_attn_q_proj_bias3, alloc1157) R.vm.kill_object(alloc1156) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias3) gv1934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1026: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1157, gv1934, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1157) gv1935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1027: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1026, gv1935, 
sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1026) gv1936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1936, R.dtype("float16")) _1156: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1027, alloc1158) R.vm.kill_object(reshape1027) gv1937: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1028: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1158, gv1937, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1158) gv1938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1029: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1028, gv1938, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1028) model_decoder_layers_31_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv1939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1939, R.dtype("float16")) _1157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight3, reshape1029, model_decoder_layers_31_encoder_attn_out_proj_bias3, alloc1159) R.vm.kill_object(reshape1029) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias3) gv1940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1940, R.dtype("float16")) R.vm.kill_object(storage15) cls.add(alloc1155, alloc1159, alloc1160) R.vm.kill_object(alloc1155) R.vm.kill_object(alloc1159) model_decoder_layers_31_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv1941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1941, R.dtype("float16")) cls.layer_norm(alloc1160, model_decoder_layers_31_final_layer_norm_weight3, model_decoder_layers_31_final_layer_norm_bias3, alloc1161) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias3) model_decoder_layers_31_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias3: 
R.Tensor((5120,), dtype="float16") = packed_params[1252] gv1942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1942, R.dtype("float16")) R.vm.kill_object(storage13) _1160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_31_fc1_weight3, alloc1161, model_decoder_layers_31_fc1_bias3, alloc1162) R.vm.kill_object(alloc1161) R.vm.kill_object(model_decoder_layers_31_fc1_weight3) R.vm.kill_object(model_decoder_layers_31_fc1_bias3) model_decoder_layers_31_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv1943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1943, R.dtype("float16")) R.vm.kill_object(storage14) _1161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_31_fc2_weight3, alloc1162, model_decoder_layers_31_fc2_bias3, alloc1163) R.vm.kill_object(alloc1162) R.vm.kill_object(model_decoder_layers_31_fc2_weight3) R.vm.kill_object(model_decoder_layers_31_fc2_bias3) gv1944: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1944, 
R.dtype("float16")) R.vm.kill_object(storage16) cls.add(alloc1160, alloc1163, alloc1164) R.vm.kill_object(alloc1160) R.vm.kill_object(alloc1163) model_decoder_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv1945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1945, R.dtype("float16")) R.vm.kill_object(storage17) cls.layer_norm(alloc1164, model_decoder_layer_norm_weight3, model_decoder_layer_norm_bias3, alloc1165) R.vm.kill_object(alloc1164) R.vm.kill_object(model_decoder_layer_norm_weight3) R.vm.kill_object(model_decoder_layer_norm_bias3) storage18: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) alloc1166: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage18, R.prim_value(0), gv1946, R.dtype("float32")) R.vm.kill_object(storage18) _1164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul4_cublas", model_decoder_embed_tokens_weight3, alloc1165, alloc1166) R.vm.kill_object(model_decoder_embed_tokens_weight3) R.vm.kill_object(alloc1165) R.call_packed("vm.builtin.match_shape", alloc1166, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_decode, loc=return, annotation=R.Tensor((batch_size, 1, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) return alloc1166 
@R.function def batch_encode(input_features: R.Tensor(("batch_size", 128, 3000), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1500, 1280), dtype="float16"): batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_features, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_encode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_features, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(128), R.prim_value(0), R.prim_value(3000), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), 
sinfo_args=(R.Tuple,)) cls.shape_func1(shape_heap) lv: R.Tensor((1280,), dtype="float16") = packed_params[1] lv1: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) R.vm.kill_object(lv) lv2: R.Tensor((1280,), dtype="float16") = packed_params[3] lv3: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv2, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) R.vm.kill_object(lv2) model_encoder_conv1_weight: R.Tensor((1280, 128, 3), dtype="float16") = packed_params[0] storage24: R.Object = R.vm.alloc_storage(R.shape([122880000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(3000), sinfo_args=(R.Shape(ndim=3),)) alloc1620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1947, R.dtype("float16")) cls.fused_conv1d_add1_gelu(input_features, model_encoder_conv1_weight, lv1, alloc1620) R.vm.kill_object(lv1) R.vm.kill_object(model_encoder_conv1_weight) model_encoder_conv2_weight: R.Tensor((1280, 1280, 3), dtype="float16") = packed_params[2] storage25: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(1500), sinfo_args=(R.Shape(ndim=3),)) alloc1621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1948, R.dtype("float16")) cls.fused_conv1d1_add2_gelu1(alloc1620, model_encoder_conv2_weight, lv3, alloc1621) R.vm.kill_object(lv3) R.vm.kill_object(alloc1620) R.vm.kill_object(model_encoder_conv2_weight) lv6: 
R.Tensor((1500, 1280), dtype="float16") = packed_params[4] gv1949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1949, R.dtype("float16")) cls.fused_transpose_add3(lv6, alloc1621, alloc1622) R.vm.kill_object(alloc1621) R.vm.kill_object(lv6) model_encoder_layers_0_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[12] model_encoder_layers_0_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[13] gv1950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1950, R.dtype("float16")) cls.layer_norm1(alloc1622, model_encoder_layers_0_self_attn_layer_norm_weight, model_encoder_layers_0_self_attn_layer_norm_bias, alloc1623) R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_bias) model_encoder_layers_0_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[8] model_encoder_layers_0_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[9] storage26: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, 
R.prim_value(0), gv1951, R.dtype("float16")) _1622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_q_proj_weight, alloc1623, model_encoder_layers_0_self_attn_q_proj_bias, alloc1624) R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_bias) gv1952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1624, gv1952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1624) model_encoder_layers_0_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[5] storage27: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1625: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1953, R.dtype("float16")) _1623: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_0_self_attn_k_proj_weight, alloc1623, alloc1625) R.vm.kill_object(model_encoder_layers_0_self_attn_k_proj_weight) gv1954: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1625, gv1954, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1625) model_encoder_layers_0_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[6] model_encoder_layers_0_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[7] storage28: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1955, R.dtype("float16")) _1624: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_v_proj_weight, alloc1623, model_encoder_layers_0_self_attn_v_proj_bias, alloc1626) R.vm.kill_object(alloc1623) R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_bias) gv1956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape2: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1626, gv1956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1626) gv1957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape3: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape, gv1957, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape) gv1958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape4: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1, gv1958, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1) gv1959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape5: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape2, gv1959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape2) gv1960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1960, R.dtype("float16")) _1625: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape3, reshape4, reshape5, alloc1627) R.vm.kill_object(reshape3) R.vm.kill_object(reshape4) R.vm.kill_object(reshape5) gv1961: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape6: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1627, gv1961, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1627) gv1962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape7: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape6, gv1962, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape6) model_encoder_layers_0_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[10] model_encoder_layers_0_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[11] gv1963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1963, R.dtype("float16")) _1626: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_out_proj_weight, reshape7, model_encoder_layers_0_self_attn_out_proj_bias, alloc1628) R.vm.kill_object(reshape7) R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_bias) gv1964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1964, R.dtype("float16")) cls.add4(alloc1622, alloc1628, alloc1629) R.vm.kill_object(alloc1622) R.vm.kill_object(alloc1628) 
model_encoder_layers_0_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[18] model_encoder_layers_0_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[19] gv1965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1965, R.dtype("float16")) cls.layer_norm1(alloc1629, model_encoder_layers_0_final_layer_norm_weight, model_encoder_layers_0_final_layer_norm_bias, alloc1630) R.vm.kill_object(model_encoder_layers_0_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_0_final_layer_norm_bias) model_encoder_layers_0_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[14] model_encoder_layers_0_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[15] gv1966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1966, R.dtype("float16")) _1629: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_0_fc1_weight, alloc1630, model_encoder_layers_0_fc1_bias, alloc1631) R.vm.kill_object(alloc1630) R.vm.kill_object(model_encoder_layers_0_fc1_weight) R.vm.kill_object(model_encoder_layers_0_fc1_bias) model_encoder_layers_0_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[16] model_encoder_layers_0_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[17] gv1967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1967, R.dtype("float16")) _1630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_0_fc2_weight, alloc1631, model_encoder_layers_0_fc2_bias, alloc1632) R.vm.kill_object(alloc1631) R.vm.kill_object(model_encoder_layers_0_fc2_weight) R.vm.kill_object(model_encoder_layers_0_fc2_bias) gv1968: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1968, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1629, alloc1632, alloc1633) R.vm.kill_object(alloc1629) R.vm.kill_object(alloc1632) model_encoder_layers_1_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[27] model_encoder_layers_1_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[28] gv1969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1969, R.dtype("float16")) cls.layer_norm1(alloc1633, model_encoder_layers_1_self_attn_layer_norm_weight, model_encoder_layers_1_self_attn_layer_norm_bias, alloc1634) R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_bias) model_encoder_layers_1_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[23] 
model_encoder_layers_1_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[24] gv1970: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1970, R.dtype("float16")) _1633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_q_proj_weight, alloc1634, model_encoder_layers_1_self_attn_q_proj_bias, alloc1635) R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_bias) gv1971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape8: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1635, gv1971, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1635) model_encoder_layers_1_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[20] gv1972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1972, R.dtype("float16")) _1634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_1_self_attn_k_proj_weight, alloc1634, alloc1636) R.vm.kill_object(model_encoder_layers_1_self_attn_k_proj_weight) gv1973: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape9: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1636, gv1973, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1636) model_encoder_layers_1_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[21] model_encoder_layers_1_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[22] gv1974: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1974, R.dtype("float16")) _1635: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_v_proj_weight, alloc1634, model_encoder_layers_1_self_attn_v_proj_bias, alloc1637) R.vm.kill_object(alloc1634) R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_bias) gv1975: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape10: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1637, gv1975, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1637) gv1976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape11: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape8, gv1976, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape8) gv1977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape12: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape9, gv1977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape9) gv1978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape13: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape10, gv1978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape10) gv1979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1979, R.dtype("float16")) _1636: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape11, reshape12, reshape13, alloc1638) R.vm.kill_object(reshape11) R.vm.kill_object(reshape12) R.vm.kill_object(reshape13) gv1980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape14: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1638, gv1980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1638) gv1981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape15: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape14, gv1981, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape14) model_encoder_layers_1_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[25] model_encoder_layers_1_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[26] gv1982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1982, R.dtype("float16")) _1637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_out_proj_weight, reshape15, model_encoder_layers_1_self_attn_out_proj_bias, alloc1639) R.vm.kill_object(reshape15) R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_bias) gv1983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1640: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1983, R.dtype("float16")) cls.add4(alloc1633, alloc1639, alloc1640) R.vm.kill_object(alloc1633) R.vm.kill_object(alloc1639) model_encoder_layers_1_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[33] model_encoder_layers_1_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[34] gv1984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1984, R.dtype("float16")) cls.layer_norm1(alloc1640, model_encoder_layers_1_final_layer_norm_weight, model_encoder_layers_1_final_layer_norm_bias, alloc1641) R.vm.kill_object(model_encoder_layers_1_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_1_final_layer_norm_bias) model_encoder_layers_1_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[29] model_encoder_layers_1_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[30] gv1985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1642: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1985, R.dtype("float16")) _1640: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_1_fc1_weight, alloc1641, model_encoder_layers_1_fc1_bias, alloc1642) R.vm.kill_object(alloc1641) R.vm.kill_object(model_encoder_layers_1_fc1_weight) R.vm.kill_object(model_encoder_layers_1_fc1_bias) model_encoder_layers_1_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[31] model_encoder_layers_1_fc2_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[32] gv1986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1986, R.dtype("float16")) _1641: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_1_fc2_weight, alloc1642, model_encoder_layers_1_fc2_bias, alloc1643) R.vm.kill_object(alloc1642) R.vm.kill_object(model_encoder_layers_1_fc2_weight) R.vm.kill_object(model_encoder_layers_1_fc2_bias) gv1987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1987, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1640, alloc1643, alloc1644) R.vm.kill_object(alloc1640) R.vm.kill_object(alloc1643) model_encoder_layers_2_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[42] model_encoder_layers_2_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[43] gv1988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1988, R.dtype("float16")) cls.layer_norm1(alloc1644, model_encoder_layers_2_self_attn_layer_norm_weight, model_encoder_layers_2_self_attn_layer_norm_bias, alloc1645) R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_bias) model_encoder_layers_2_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[38] model_encoder_layers_2_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[39] gv1989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1989, R.dtype("float16")) _1644: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_q_proj_weight, alloc1645, model_encoder_layers_2_self_attn_q_proj_bias, alloc1646) R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_bias) gv1990: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape16: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1646, gv1990, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1646) model_encoder_layers_2_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[35] gv1991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1991, R.dtype("float16")) _1645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", 
model_encoder_layers_2_self_attn_k_proj_weight, alloc1645, alloc1647) R.vm.kill_object(model_encoder_layers_2_self_attn_k_proj_weight) gv1992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape17: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1647, gv1992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1647) model_encoder_layers_2_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[36] model_encoder_layers_2_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[37] gv1993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1648: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1993, R.dtype("float16")) _1646: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_v_proj_weight, alloc1645, model_encoder_layers_2_self_attn_v_proj_bias, alloc1648) R.vm.kill_object(alloc1645) R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_bias) gv1994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape18: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1648, gv1994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc1648) gv1995: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape19: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape16, gv1995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape16) gv1996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape20: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape17, gv1996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape17) gv1997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape21: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape18, gv1997, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape18) gv1998: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1998, R.dtype("float16")) _1647: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape19, reshape20, reshape21, alloc1649) R.vm.kill_object(reshape19) R.vm.kill_object(reshape20) 
R.vm.kill_object(reshape21) gv1999: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape22: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1649, gv1999, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1649) gv2000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape23: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape22, gv2000, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape22) model_encoder_layers_2_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[40] model_encoder_layers_2_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[41] gv2001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2001, R.dtype("float16")) _1648: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_out_proj_weight, reshape23, model_encoder_layers_2_self_attn_out_proj_bias, alloc1650) R.vm.kill_object(reshape23) R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_bias) gv2002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2002, R.dtype("float16")) cls.add4(alloc1644, alloc1650, alloc1651) R.vm.kill_object(alloc1644) R.vm.kill_object(alloc1650) model_encoder_layers_2_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[48] model_encoder_layers_2_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[49] gv2003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2003, R.dtype("float16")) cls.layer_norm1(alloc1651, model_encoder_layers_2_final_layer_norm_weight, model_encoder_layers_2_final_layer_norm_bias, alloc1652) R.vm.kill_object(model_encoder_layers_2_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_2_final_layer_norm_bias) model_encoder_layers_2_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[44] model_encoder_layers_2_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[45] gv2004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1653: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2004, R.dtype("float16")) _1651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_2_fc1_weight, alloc1652, model_encoder_layers_2_fc1_bias, alloc1653) R.vm.kill_object(alloc1652) R.vm.kill_object(model_encoder_layers_2_fc1_weight) 
R.vm.kill_object(model_encoder_layers_2_fc1_bias) model_encoder_layers_2_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[46] model_encoder_layers_2_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[47] gv2005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2005, R.dtype("float16")) _1652: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_2_fc2_weight, alloc1653, model_encoder_layers_2_fc2_bias, alloc1654) R.vm.kill_object(alloc1653) R.vm.kill_object(model_encoder_layers_2_fc2_weight) R.vm.kill_object(model_encoder_layers_2_fc2_bias) gv2006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2006, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1651, alloc1654, alloc1655) R.vm.kill_object(alloc1651) R.vm.kill_object(alloc1654) model_encoder_layers_3_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[57] model_encoder_layers_3_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[58] gv2007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2007, R.dtype("float16")) cls.layer_norm1(alloc1655, 
model_encoder_layers_3_self_attn_layer_norm_weight, model_encoder_layers_3_self_attn_layer_norm_bias, alloc1656) R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_bias) model_encoder_layers_3_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[53] model_encoder_layers_3_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[54] gv2008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2008, R.dtype("float16")) _1655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_q_proj_weight, alloc1656, model_encoder_layers_3_self_attn_q_proj_bias, alloc1657) R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_bias) gv2009: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape24: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1657, gv2009, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1657) model_encoder_layers_3_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[50] gv2010: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1658: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2010, R.dtype("float16")) _1656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_3_self_attn_k_proj_weight, alloc1656, alloc1658) R.vm.kill_object(model_encoder_layers_3_self_attn_k_proj_weight) gv2011: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape25: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1658, gv2011, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1658) model_encoder_layers_3_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[51] model_encoder_layers_3_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[52] gv2012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1659: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2012, R.dtype("float16")) _1657: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_v_proj_weight, alloc1656, model_encoder_layers_3_self_attn_v_proj_bias, alloc1659) R.vm.kill_object(alloc1656) R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_bias) gv2013: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape26: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1659, gv2013, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1659) gv2014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape27: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape24, gv2014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape24) gv2015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape28: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape25, gv2015, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape25) gv2016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape29: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape26, gv2016, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape26) gv2017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1660: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2017, R.dtype("float16")) _1658: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape27, reshape28, reshape29, alloc1660) R.vm.kill_object(reshape27) R.vm.kill_object(reshape28) R.vm.kill_object(reshape29) gv2018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape30: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1660, gv2018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1660) gv2019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape31: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape30, gv2019, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape30) model_encoder_layers_3_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[55] model_encoder_layers_3_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[56] gv2020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2020, R.dtype("float16")) _1659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_out_proj_weight, reshape31, model_encoder_layers_3_self_attn_out_proj_bias, alloc1661) R.vm.kill_object(reshape31) 
R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_bias) gv2021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2021, R.dtype("float16")) cls.add4(alloc1655, alloc1661, alloc1662) R.vm.kill_object(alloc1655) R.vm.kill_object(alloc1661) model_encoder_layers_3_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[63] model_encoder_layers_3_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[64] gv2022: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2022, R.dtype("float16")) cls.layer_norm1(alloc1662, model_encoder_layers_3_final_layer_norm_weight, model_encoder_layers_3_final_layer_norm_bias, alloc1663) R.vm.kill_object(model_encoder_layers_3_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_3_final_layer_norm_bias) model_encoder_layers_3_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[59] model_encoder_layers_3_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[60] gv2023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2023, R.dtype("float16")) _1662: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_3_fc1_weight, alloc1663, model_encoder_layers_3_fc1_bias, alloc1664) R.vm.kill_object(alloc1663) R.vm.kill_object(model_encoder_layers_3_fc1_weight) R.vm.kill_object(model_encoder_layers_3_fc1_bias) model_encoder_layers_3_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[61] model_encoder_layers_3_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[62] gv2024: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2024, R.dtype("float16")) _1663: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_3_fc2_weight, alloc1664, model_encoder_layers_3_fc2_bias, alloc1665) R.vm.kill_object(alloc1664) R.vm.kill_object(model_encoder_layers_3_fc2_weight) R.vm.kill_object(model_encoder_layers_3_fc2_bias) gv2025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2025, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1662, alloc1665, alloc1666) R.vm.kill_object(alloc1662) R.vm.kill_object(alloc1665) model_encoder_layers_4_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[72] model_encoder_layers_4_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[73] gv2026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2026, R.dtype("float16")) cls.layer_norm1(alloc1666, model_encoder_layers_4_self_attn_layer_norm_weight, model_encoder_layers_4_self_attn_layer_norm_bias, alloc1667) R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_bias) model_encoder_layers_4_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[68] model_encoder_layers_4_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[69] gv2027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2027, R.dtype("float16")) _1666: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_q_proj_weight, alloc1667, model_encoder_layers_4_self_attn_q_proj_bias, alloc1668) R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_bias) gv2028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape32: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1668, gv2028, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1668) model_encoder_layers_4_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[65] gv2029: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2029, R.dtype("float16")) _1667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_4_self_attn_k_proj_weight, alloc1667, alloc1669) R.vm.kill_object(model_encoder_layers_4_self_attn_k_proj_weight) gv2030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape33: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1669, gv2030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1669) model_encoder_layers_4_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[66] model_encoder_layers_4_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[67] gv2031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1670: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2031, R.dtype("float16")) _1668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_v_proj_weight, alloc1667, model_encoder_layers_4_self_attn_v_proj_bias, alloc1670) R.vm.kill_object(alloc1667) R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_bias) gv2032: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape34: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1670, gv2032, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1670) gv2033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape35: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape32, gv2033, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape32) gv2034: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape36: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape33, gv2034, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape33) gv2035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape37: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape34, gv2035, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape34) gv2036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2036, R.dtype("float16")) _1669: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape35, reshape36, reshape37, alloc1671) R.vm.kill_object(reshape35) R.vm.kill_object(reshape36) R.vm.kill_object(reshape37) gv2037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape38: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1671, gv2037, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1671) gv2038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape39: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape38, gv2038, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape38) model_encoder_layers_4_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[70] model_encoder_layers_4_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[71] gv2039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2039, R.dtype("float16")) _1670: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_out_proj_weight, reshape39, model_encoder_layers_4_self_attn_out_proj_bias, alloc1672) R.vm.kill_object(reshape39) R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_bias) gv2040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2040, R.dtype("float16")) cls.add4(alloc1666, alloc1672, alloc1673) R.vm.kill_object(alloc1666) R.vm.kill_object(alloc1672) model_encoder_layers_4_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[78] model_encoder_layers_4_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[79] gv2041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2041, R.dtype("float16")) cls.layer_norm1(alloc1673, model_encoder_layers_4_final_layer_norm_weight, model_encoder_layers_4_final_layer_norm_bias, alloc1674) R.vm.kill_object(model_encoder_layers_4_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_4_final_layer_norm_bias) model_encoder_layers_4_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[74] model_encoder_layers_4_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[75] gv2042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2042, R.dtype("float16")) _1673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_4_fc1_weight, alloc1674, model_encoder_layers_4_fc1_bias, alloc1675) R.vm.kill_object(alloc1674) R.vm.kill_object(model_encoder_layers_4_fc1_weight) R.vm.kill_object(model_encoder_layers_4_fc1_bias) model_encoder_layers_4_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[76] model_encoder_layers_4_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[77] gv2043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1676: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2043, R.dtype("float16")) _1674: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_4_fc2_weight, alloc1675, model_encoder_layers_4_fc2_bias, alloc1676) R.vm.kill_object(alloc1675) R.vm.kill_object(model_encoder_layers_4_fc2_weight) R.vm.kill_object(model_encoder_layers_4_fc2_bias) gv2044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2044, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1673, alloc1676, alloc1677) R.vm.kill_object(alloc1673) R.vm.kill_object(alloc1676) model_encoder_layers_5_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[87] model_encoder_layers_5_self_attn_layer_norm_bias: R.Tensor((1280,), 
dtype="float16") = packed_params[88] gv2045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2045, R.dtype("float16")) cls.layer_norm1(alloc1677, model_encoder_layers_5_self_attn_layer_norm_weight, model_encoder_layers_5_self_attn_layer_norm_bias, alloc1678) R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_bias) model_encoder_layers_5_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[83] model_encoder_layers_5_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[84] gv2046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2046, R.dtype("float16")) _1677: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_q_proj_weight, alloc1678, model_encoder_layers_5_self_attn_q_proj_bias, alloc1679) R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_bias) gv2047: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape40: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1679, gv2047, sinfo_args=(R.Tensor((batch_size, 1500, 20, 
64), dtype="float16"),)) R.vm.kill_object(alloc1679) model_encoder_layers_5_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[80] gv2048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2048, R.dtype("float16")) _1678: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_5_self_attn_k_proj_weight, alloc1678, alloc1680) R.vm.kill_object(model_encoder_layers_5_self_attn_k_proj_weight) gv2049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape41: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1680, gv2049, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1680) model_encoder_layers_5_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[81] model_encoder_layers_5_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[82] gv2050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2050, R.dtype("float16")) _1679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_v_proj_weight, alloc1678, model_encoder_layers_5_self_attn_v_proj_bias, alloc1681) 
R.vm.kill_object(alloc1678) R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_bias) gv2051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape42: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1681, gv2051, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1681) gv2052: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape43: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape40, gv2052, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape40) gv2053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape44: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape41, gv2053, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape41) gv2054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape45: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape42, gv2054, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape42) gv2055: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2055, R.dtype("float16")) _1680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape43, reshape44, reshape45, alloc1682) R.vm.kill_object(reshape43) R.vm.kill_object(reshape44) R.vm.kill_object(reshape45) gv2056: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape46: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1682, gv2056, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1682) gv2057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape47: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape46, gv2057, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape46) model_encoder_layers_5_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[85] model_encoder_layers_5_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[86] gv2058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2058, R.dtype("float16")) _1681: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_out_proj_weight, reshape47, model_encoder_layers_5_self_attn_out_proj_bias, alloc1683) R.vm.kill_object(reshape47) R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_bias) gv2059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2059, R.dtype("float16")) cls.add4(alloc1677, alloc1683, alloc1684) R.vm.kill_object(alloc1677) R.vm.kill_object(alloc1683) model_encoder_layers_5_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[93] model_encoder_layers_5_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[94] gv2060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1685: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2060, R.dtype("float16")) cls.layer_norm1(alloc1684, model_encoder_layers_5_final_layer_norm_weight, model_encoder_layers_5_final_layer_norm_bias, alloc1685) R.vm.kill_object(model_encoder_layers_5_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_5_final_layer_norm_bias) model_encoder_layers_5_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[89] model_encoder_layers_5_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[90] gv2061: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2061, R.dtype("float16")) _1684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_5_fc1_weight, alloc1685, model_encoder_layers_5_fc1_bias, alloc1686) R.vm.kill_object(alloc1685) R.vm.kill_object(model_encoder_layers_5_fc1_weight) R.vm.kill_object(model_encoder_layers_5_fc1_bias) model_encoder_layers_5_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[91] model_encoder_layers_5_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[92] gv2062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2062, R.dtype("float16")) _1685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_5_fc2_weight, alloc1686, model_encoder_layers_5_fc2_bias, alloc1687) R.vm.kill_object(alloc1686) R.vm.kill_object(model_encoder_layers_5_fc2_weight) R.vm.kill_object(model_encoder_layers_5_fc2_bias) gv2063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2063, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1684, alloc1687, alloc1688) R.vm.kill_object(alloc1684) R.vm.kill_object(alloc1687) 
model_encoder_layers_6_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[102] model_encoder_layers_6_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[103] gv2064: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2064, R.dtype("float16")) cls.layer_norm1(alloc1688, model_encoder_layers_6_self_attn_layer_norm_weight, model_encoder_layers_6_self_attn_layer_norm_bias, alloc1689) R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_bias) model_encoder_layers_6_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[98] model_encoder_layers_6_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[99] gv2065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2065, R.dtype("float16")) _1688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_q_proj_weight, alloc1689, model_encoder_layers_6_self_attn_q_proj_bias, alloc1690) R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_bias) gv2066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape48: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1690, gv2066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1690) model_encoder_layers_6_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[95] gv2067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2067, R.dtype("float16")) _1689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_6_self_attn_k_proj_weight, alloc1689, alloc1691) R.vm.kill_object(model_encoder_layers_6_self_attn_k_proj_weight) gv2068: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape49: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1691, gv2068, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1691) model_encoder_layers_6_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[96] model_encoder_layers_6_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[97] gv2069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2069, R.dtype("float16")) _1690: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_v_proj_weight, alloc1689, model_encoder_layers_6_self_attn_v_proj_bias, alloc1692) R.vm.kill_object(alloc1689) R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_bias) gv2070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape50: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1692, gv2070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1692) gv2071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape51: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape48, gv2071, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape48) gv2072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape52: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape49, gv2072, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape49) gv2073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape53: R.Tensor((batch_size * 1500, 
20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape50, gv2073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape50) gv2074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1693: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2074, R.dtype("float16")) _1691: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape51, reshape52, reshape53, alloc1693) R.vm.kill_object(reshape51) R.vm.kill_object(reshape52) R.vm.kill_object(reshape53) gv2075: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape54: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1693, gv2075, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1693) gv2076: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape55: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape54, gv2076, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape54) model_encoder_layers_6_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[100] model_encoder_layers_6_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[101] gv2077: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1694: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2077, R.dtype("float16")) _1692: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_out_proj_weight, reshape55, model_encoder_layers_6_self_attn_out_proj_bias, alloc1694) R.vm.kill_object(reshape55) R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_bias) gv2078: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2078, R.dtype("float16")) cls.add4(alloc1688, alloc1694, alloc1695) R.vm.kill_object(alloc1688) R.vm.kill_object(alloc1694) model_encoder_layers_6_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[108] model_encoder_layers_6_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[109] gv2079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2079, R.dtype("float16")) cls.layer_norm1(alloc1695, model_encoder_layers_6_final_layer_norm_weight, model_encoder_layers_6_final_layer_norm_bias, alloc1696) R.vm.kill_object(model_encoder_layers_6_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_6_final_layer_norm_bias) 
model_encoder_layers_6_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[104] model_encoder_layers_6_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[105] gv2080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2080, R.dtype("float16")) _1695: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_6_fc1_weight, alloc1696, model_encoder_layers_6_fc1_bias, alloc1697) R.vm.kill_object(alloc1696) R.vm.kill_object(model_encoder_layers_6_fc1_weight) R.vm.kill_object(model_encoder_layers_6_fc1_bias) model_encoder_layers_6_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[106] model_encoder_layers_6_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[107] gv2081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1698: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2081, R.dtype("float16")) _1696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_6_fc2_weight, alloc1697, model_encoder_layers_6_fc2_bias, alloc1698) R.vm.kill_object(alloc1697) R.vm.kill_object(model_encoder_layers_6_fc2_weight) R.vm.kill_object(model_encoder_layers_6_fc2_bias) gv2082: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1699: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage26, R.prim_value(0), gv2082, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1695, alloc1698, alloc1699) R.vm.kill_object(alloc1695) R.vm.kill_object(alloc1698) model_encoder_layers_7_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[117] model_encoder_layers_7_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[118] gv2083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2083, R.dtype("float16")) cls.layer_norm1(alloc1699, model_encoder_layers_7_self_attn_layer_norm_weight, model_encoder_layers_7_self_attn_layer_norm_bias, alloc1700) R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_bias) model_encoder_layers_7_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[113] model_encoder_layers_7_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[114] gv2084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2084, R.dtype("float16")) _1699: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_q_proj_weight, alloc1700, model_encoder_layers_7_self_attn_q_proj_bias, alloc1701) R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_bias) gv2085: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape56: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1701, gv2085, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1701) model_encoder_layers_7_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[110] gv2086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2086, R.dtype("float16")) _1700: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_7_self_attn_k_proj_weight, alloc1700, alloc1702) R.vm.kill_object(model_encoder_layers_7_self_attn_k_proj_weight) gv2087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape57: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1702, gv2087, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1702) model_encoder_layers_7_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[111] model_encoder_layers_7_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[112] gv2088: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2088, R.dtype("float16")) _1701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_v_proj_weight, alloc1700, model_encoder_layers_7_self_attn_v_proj_bias, alloc1703) R.vm.kill_object(alloc1700) R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_bias) gv2089: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape58: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1703, gv2089, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1703) gv2090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape59: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape56, gv2090, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape56) gv2091: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape60: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape57, gv2091, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape57) gv2092: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape61: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape58, gv2092, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape58) gv2093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2093, R.dtype("float16")) _1702: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape59, reshape60, reshape61, alloc1704) R.vm.kill_object(reshape59) R.vm.kill_object(reshape60) R.vm.kill_object(reshape61) gv2094: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape62: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1704, gv2094, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1704) gv2095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape63: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape62, gv2095, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape62) 
model_encoder_layers_7_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[115] model_encoder_layers_7_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[116] gv2096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2096, R.dtype("float16")) _1703: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_out_proj_weight, reshape63, model_encoder_layers_7_self_attn_out_proj_bias, alloc1705) R.vm.kill_object(reshape63) R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_bias) gv2097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1706: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2097, R.dtype("float16")) cls.add4(alloc1699, alloc1705, alloc1706) R.vm.kill_object(alloc1699) R.vm.kill_object(alloc1705) model_encoder_layers_7_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[123] model_encoder_layers_7_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[124] gv2098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2098, R.dtype("float16")) cls.layer_norm1(alloc1706, 
model_encoder_layers_7_final_layer_norm_weight, model_encoder_layers_7_final_layer_norm_bias, alloc1707) R.vm.kill_object(model_encoder_layers_7_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_7_final_layer_norm_bias) model_encoder_layers_7_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[119] model_encoder_layers_7_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[120] gv2099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2099, R.dtype("float16")) _1706: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_7_fc1_weight, alloc1707, model_encoder_layers_7_fc1_bias, alloc1708) R.vm.kill_object(alloc1707) R.vm.kill_object(model_encoder_layers_7_fc1_weight) R.vm.kill_object(model_encoder_layers_7_fc1_bias) model_encoder_layers_7_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[121] model_encoder_layers_7_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[122] gv2100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2100, R.dtype("float16")) _1707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_7_fc2_weight, alloc1708, model_encoder_layers_7_fc2_bias, alloc1709) R.vm.kill_object(alloc1708) R.vm.kill_object(model_encoder_layers_7_fc2_weight) R.vm.kill_object(model_encoder_layers_7_fc2_bias) gv2101: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1710: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2101, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1706, alloc1709, alloc1710) R.vm.kill_object(alloc1706) R.vm.kill_object(alloc1709) model_encoder_layers_8_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[132] model_encoder_layers_8_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[133] gv2102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2102, R.dtype("float16")) cls.layer_norm1(alloc1710, model_encoder_layers_8_self_attn_layer_norm_weight, model_encoder_layers_8_self_attn_layer_norm_bias, alloc1711) R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_bias) model_encoder_layers_8_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[128] model_encoder_layers_8_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[129] gv2103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2103, R.dtype("float16")) _1710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_q_proj_weight, 
alloc1711, model_encoder_layers_8_self_attn_q_proj_bias, alloc1712) R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_bias) gv2104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape64: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1712, gv2104, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1712) model_encoder_layers_8_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[125] gv2105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1713: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2105, R.dtype("float16")) _1711: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_8_self_attn_k_proj_weight, alloc1711, alloc1713) R.vm.kill_object(model_encoder_layers_8_self_attn_k_proj_weight) gv2106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape65: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1713, gv2106, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1713) model_encoder_layers_8_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[126] model_encoder_layers_8_self_attn_v_proj_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[127] gv2107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2107, R.dtype("float16")) _1712: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_v_proj_weight, alloc1711, model_encoder_layers_8_self_attn_v_proj_bias, alloc1714) R.vm.kill_object(alloc1711) R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_bias) gv2108: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape66: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1714, gv2108, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1714) gv2109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape67: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape64, gv2109, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape64) gv2110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape68: R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape65, gv2110, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape65) gv2111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape69: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape66, gv2111, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape66) gv2112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2112, R.dtype("float16")) _1713: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape67, reshape68, reshape69, alloc1715) R.vm.kill_object(reshape67) R.vm.kill_object(reshape68) R.vm.kill_object(reshape69) gv2113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape70: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1715, gv2113, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1715) gv2114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape71: R.Tensor((batch_size, 
1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape70, gv2114, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape70) model_encoder_layers_8_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[130] model_encoder_layers_8_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[131] gv2115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2115, R.dtype("float16")) _1714: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_out_proj_weight, reshape71, model_encoder_layers_8_self_attn_out_proj_bias, alloc1716) R.vm.kill_object(reshape71) R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_bias) gv2116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2116, R.dtype("float16")) cls.add4(alloc1710, alloc1716, alloc1717) R.vm.kill_object(alloc1710) R.vm.kill_object(alloc1716) model_encoder_layers_8_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[138] model_encoder_layers_8_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[139] gv2117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2117, R.dtype("float16")) cls.layer_norm1(alloc1717, model_encoder_layers_8_final_layer_norm_weight, model_encoder_layers_8_final_layer_norm_bias, alloc1718) R.vm.kill_object(model_encoder_layers_8_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_8_final_layer_norm_bias) model_encoder_layers_8_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[134] model_encoder_layers_8_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[135] gv2118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1719: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2118, R.dtype("float16")) _1717: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_8_fc1_weight, alloc1718, model_encoder_layers_8_fc1_bias, alloc1719) R.vm.kill_object(alloc1718) R.vm.kill_object(model_encoder_layers_8_fc1_weight) R.vm.kill_object(model_encoder_layers_8_fc1_bias) model_encoder_layers_8_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[136] model_encoder_layers_8_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[137] gv2119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2119, R.dtype("float16")) _1718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_8_fc2_weight, alloc1719, model_encoder_layers_8_fc2_bias, alloc1720) 
R.vm.kill_object(alloc1719) R.vm.kill_object(model_encoder_layers_8_fc2_weight) R.vm.kill_object(model_encoder_layers_8_fc2_bias) gv2120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2120, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1717, alloc1720, alloc1721) R.vm.kill_object(alloc1717) R.vm.kill_object(alloc1720) model_encoder_layers_9_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[147] model_encoder_layers_9_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[148] gv2121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2121, R.dtype("float16")) cls.layer_norm1(alloc1721, model_encoder_layers_9_self_attn_layer_norm_weight, model_encoder_layers_9_self_attn_layer_norm_bias, alloc1722) R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_bias) model_encoder_layers_9_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[143] model_encoder_layers_9_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[144] gv2122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2122, 
R.dtype("float16")) _1721: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_q_proj_weight, alloc1722, model_encoder_layers_9_self_attn_q_proj_bias, alloc1723) R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_bias) gv2123: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape72: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1723, gv2123, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1723) model_encoder_layers_9_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[140] gv2124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2124, R.dtype("float16")) _1722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_9_self_attn_k_proj_weight, alloc1722, alloc1724) R.vm.kill_object(model_encoder_layers_9_self_attn_k_proj_weight) gv2125: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape73: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1724, gv2125, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1724) model_encoder_layers_9_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[141] model_encoder_layers_9_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[142] gv2126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2126, R.dtype("float16")) _1723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_v_proj_weight, alloc1722, model_encoder_layers_9_self_attn_v_proj_bias, alloc1725) R.vm.kill_object(alloc1722) R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_bias) gv2127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape74: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1725, gv2127, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1725) gv2128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape75: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape72, gv2128, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape72) gv2129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape76: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape73, gv2129, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape73) gv2130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape77: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape74, gv2130, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape74) gv2131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2131, R.dtype("float16")) _1724: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape75, reshape76, reshape77, alloc1726) R.vm.kill_object(reshape75) R.vm.kill_object(reshape76) R.vm.kill_object(reshape77) gv2132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape78: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1726, gv2132, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1726) gv2133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape79: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape78, gv2133, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape78) model_encoder_layers_9_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[145] model_encoder_layers_9_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[146] gv2134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1727: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2134, R.dtype("float16")) _1725: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_out_proj_weight, reshape79, model_encoder_layers_9_self_attn_out_proj_bias, alloc1727) R.vm.kill_object(reshape79) R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_bias) gv2135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2135, R.dtype("float16")) cls.add4(alloc1721, alloc1727, alloc1728) R.vm.kill_object(alloc1721) R.vm.kill_object(alloc1727) model_encoder_layers_9_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[153] model_encoder_layers_9_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[154] gv2136: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2136, R.dtype("float16")) cls.layer_norm1(alloc1728, model_encoder_layers_9_final_layer_norm_weight, model_encoder_layers_9_final_layer_norm_bias, alloc1729) R.vm.kill_object(model_encoder_layers_9_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_9_final_layer_norm_bias) model_encoder_layers_9_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[149] model_encoder_layers_9_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[150] gv2137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2137, R.dtype("float16")) _1728: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_9_fc1_weight, alloc1729, model_encoder_layers_9_fc1_bias, alloc1730) R.vm.kill_object(alloc1729) R.vm.kill_object(model_encoder_layers_9_fc1_weight) R.vm.kill_object(model_encoder_layers_9_fc1_bias) model_encoder_layers_9_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[151] model_encoder_layers_9_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[152] gv2138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2138, R.dtype("float16")) _1729: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_9_fc2_weight, alloc1730, model_encoder_layers_9_fc2_bias, alloc1731) R.vm.kill_object(alloc1730) R.vm.kill_object(model_encoder_layers_9_fc2_weight) R.vm.kill_object(model_encoder_layers_9_fc2_bias) gv2139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2139, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1728, alloc1731, alloc1732) R.vm.kill_object(alloc1728) R.vm.kill_object(alloc1731) model_encoder_layers_10_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[162] model_encoder_layers_10_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[163] gv2140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2140, R.dtype("float16")) cls.layer_norm1(alloc1732, model_encoder_layers_10_self_attn_layer_norm_weight, model_encoder_layers_10_self_attn_layer_norm_bias, alloc1733) R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_bias) model_encoder_layers_10_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[158] model_encoder_layers_10_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[159] gv2141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2141, R.dtype("float16")) _1732: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_q_proj_weight, alloc1733, model_encoder_layers_10_self_attn_q_proj_bias, alloc1734) R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_bias) gv2142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape80: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1734, gv2142, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1734) model_encoder_layers_10_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[155] gv2143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2143, R.dtype("float16")) _1733: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_10_self_attn_k_proj_weight, alloc1733, alloc1735) R.vm.kill_object(model_encoder_layers_10_self_attn_k_proj_weight) gv2144: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape81: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1735, gv2144, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1735) model_encoder_layers_10_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[156] model_encoder_layers_10_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[157] gv2145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1736: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2145, R.dtype("float16")) _1734: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_v_proj_weight, alloc1733, model_encoder_layers_10_self_attn_v_proj_bias, alloc1736) R.vm.kill_object(alloc1733) R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_bias) gv2146: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape82: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1736, gv2146, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1736) gv2147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape83: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape80, gv2147, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape80) gv2148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape84: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape81, gv2148, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape81) gv2149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape85: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape82, gv2149, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape82) gv2150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2150, R.dtype("float16")) _1735: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape83, reshape84, reshape85, alloc1737) R.vm.kill_object(reshape83) R.vm.kill_object(reshape84) R.vm.kill_object(reshape85) gv2151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape86: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1737, gv2151, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1737) gv2152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape87: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape86, gv2152, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape86) model_encoder_layers_10_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[160] model_encoder_layers_10_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[161] gv2153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2153, R.dtype("float16")) _1736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_out_proj_weight, reshape87, model_encoder_layers_10_self_attn_out_proj_bias, alloc1738) R.vm.kill_object(reshape87) R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_bias) gv2154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2154, R.dtype("float16")) cls.add4(alloc1732, alloc1738, alloc1739) R.vm.kill_object(alloc1732) R.vm.kill_object(alloc1738) 
model_encoder_layers_10_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[168] model_encoder_layers_10_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[169] gv2155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2155, R.dtype("float16")) cls.layer_norm1(alloc1739, model_encoder_layers_10_final_layer_norm_weight, model_encoder_layers_10_final_layer_norm_bias, alloc1740) R.vm.kill_object(model_encoder_layers_10_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_10_final_layer_norm_bias) model_encoder_layers_10_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[164] model_encoder_layers_10_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[165] gv2156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2156, R.dtype("float16")) _1739: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_10_fc1_weight, alloc1740, model_encoder_layers_10_fc1_bias, alloc1741) R.vm.kill_object(alloc1740) R.vm.kill_object(model_encoder_layers_10_fc1_weight) R.vm.kill_object(model_encoder_layers_10_fc1_bias) model_encoder_layers_10_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[166] model_encoder_layers_10_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[167] gv2157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2157, R.dtype("float16")) _1740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_10_fc2_weight, alloc1741, model_encoder_layers_10_fc2_bias, alloc1742) R.vm.kill_object(alloc1741) R.vm.kill_object(model_encoder_layers_10_fc2_weight) R.vm.kill_object(model_encoder_layers_10_fc2_bias) gv2158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2158, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1739, alloc1742, alloc1743) R.vm.kill_object(alloc1739) R.vm.kill_object(alloc1742) model_encoder_layers_11_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[177] model_encoder_layers_11_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[178] gv2159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1744: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2159, R.dtype("float16")) cls.layer_norm1(alloc1743, model_encoder_layers_11_self_attn_layer_norm_weight, model_encoder_layers_11_self_attn_layer_norm_bias, alloc1744) R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_bias) model_encoder_layers_11_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[173] 
model_encoder_layers_11_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[174] gv2160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2160, R.dtype("float16")) _1743: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_q_proj_weight, alloc1744, model_encoder_layers_11_self_attn_q_proj_bias, alloc1745) R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_bias) gv2161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape88: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1745, gv2161, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1745) model_encoder_layers_11_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[170] gv2162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2162, R.dtype("float16")) _1744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_11_self_attn_k_proj_weight, alloc1744, alloc1746) R.vm.kill_object(model_encoder_layers_11_self_attn_k_proj_weight) gv2163: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape89: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1746, gv2163, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1746) model_encoder_layers_11_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[171] model_encoder_layers_11_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[172] gv2164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2164, R.dtype("float16")) _1745: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_v_proj_weight, alloc1744, model_encoder_layers_11_self_attn_v_proj_bias, alloc1747) R.vm.kill_object(alloc1744) R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_bias) gv2165: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape90: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1747, gv2165, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1747) gv2166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape91: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape88, gv2166, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape88) gv2167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape92: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape89, gv2167, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape89) gv2168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape93: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape90, gv2168, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape90) gv2169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2169, R.dtype("float16")) _1746: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape91, reshape92, reshape93, alloc1748) R.vm.kill_object(reshape91) R.vm.kill_object(reshape92) R.vm.kill_object(reshape93) gv2170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape94: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1748, gv2170, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1748) gv2171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape95: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape94, gv2171, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape94) model_encoder_layers_11_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[175] model_encoder_layers_11_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[176] gv2172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2172, R.dtype("float16")) _1747: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_out_proj_weight, reshape95, model_encoder_layers_11_self_attn_out_proj_bias, alloc1749) R.vm.kill_object(reshape95) R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_bias) gv2173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1750: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2173, R.dtype("float16")) cls.add4(alloc1743, alloc1749, alloc1750) R.vm.kill_object(alloc1743) R.vm.kill_object(alloc1749) model_encoder_layers_11_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[183] model_encoder_layers_11_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[184] gv2174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2174, R.dtype("float16")) cls.layer_norm1(alloc1750, model_encoder_layers_11_final_layer_norm_weight, model_encoder_layers_11_final_layer_norm_bias, alloc1751) R.vm.kill_object(model_encoder_layers_11_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_11_final_layer_norm_bias) model_encoder_layers_11_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[179] model_encoder_layers_11_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[180] gv2175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1752: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2175, R.dtype("float16")) _1750: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_11_fc1_weight, alloc1751, model_encoder_layers_11_fc1_bias, alloc1752) R.vm.kill_object(alloc1751) R.vm.kill_object(model_encoder_layers_11_fc1_weight) R.vm.kill_object(model_encoder_layers_11_fc1_bias) model_encoder_layers_11_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = 
packed_params[181] model_encoder_layers_11_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[182] gv2176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1753: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2176, R.dtype("float16")) _1751: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_11_fc2_weight, alloc1752, model_encoder_layers_11_fc2_bias, alloc1753) R.vm.kill_object(alloc1752) R.vm.kill_object(model_encoder_layers_11_fc2_weight) R.vm.kill_object(model_encoder_layers_11_fc2_bias) gv2177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2177, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1750, alloc1753, alloc1754) R.vm.kill_object(alloc1750) R.vm.kill_object(alloc1753) model_encoder_layers_12_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[192] model_encoder_layers_12_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[193] gv2178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2178, R.dtype("float16")) cls.layer_norm1(alloc1754, model_encoder_layers_12_self_attn_layer_norm_weight, model_encoder_layers_12_self_attn_layer_norm_bias, alloc1755) 
R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_bias) model_encoder_layers_12_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[188] model_encoder_layers_12_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[189] gv2179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2179, R.dtype("float16")) _1754: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_q_proj_weight, alloc1755, model_encoder_layers_12_self_attn_q_proj_bias, alloc1756) R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_bias) gv2180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape96: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1756, gv2180, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1756) model_encoder_layers_12_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[185] gv2181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2181, R.dtype("float16")) _1755: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_12_self_attn_k_proj_weight, alloc1755, alloc1757) R.vm.kill_object(model_encoder_layers_12_self_attn_k_proj_weight) gv2182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape97: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1757, gv2182, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1757) model_encoder_layers_12_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[186] model_encoder_layers_12_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[187] gv2183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2183, R.dtype("float16")) _1756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_v_proj_weight, alloc1755, model_encoder_layers_12_self_attn_v_proj_bias, alloc1758) R.vm.kill_object(alloc1755) R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_bias) gv2184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape98: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1758, gv2184, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1758) gv2185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape99: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape96, gv2185, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape96) gv2186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape100: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape97, gv2186, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape97) gv2187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape101: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape98, gv2187, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape98) gv2188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2188, R.dtype("float16")) _1757: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape99, reshape100, reshape101, 
alloc1759) R.vm.kill_object(reshape99) R.vm.kill_object(reshape100) R.vm.kill_object(reshape101) gv2189: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape102: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1759, gv2189, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1759) gv2190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape103: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape102, gv2190, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape102) model_encoder_layers_12_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[190] model_encoder_layers_12_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[191] gv2191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1760: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2191, R.dtype("float16")) _1758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_out_proj_weight, reshape103, model_encoder_layers_12_self_attn_out_proj_bias, alloc1760) R.vm.kill_object(reshape103) R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_bias) gv2192: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1761: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2192, R.dtype("float16")) cls.add4(alloc1754, alloc1760, alloc1761) R.vm.kill_object(alloc1754) R.vm.kill_object(alloc1760) model_encoder_layers_12_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[198] model_encoder_layers_12_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[199] gv2193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2193, R.dtype("float16")) cls.layer_norm1(alloc1761, model_encoder_layers_12_final_layer_norm_weight, model_encoder_layers_12_final_layer_norm_bias, alloc1762) R.vm.kill_object(model_encoder_layers_12_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_12_final_layer_norm_bias) model_encoder_layers_12_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[194] model_encoder_layers_12_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[195] gv2194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2194, R.dtype("float16")) _1761: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_12_fc1_weight, alloc1762, model_encoder_layers_12_fc1_bias, 
alloc1763) R.vm.kill_object(alloc1762) R.vm.kill_object(model_encoder_layers_12_fc1_weight) R.vm.kill_object(model_encoder_layers_12_fc1_bias) model_encoder_layers_12_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[196] model_encoder_layers_12_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[197] gv2195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2195, R.dtype("float16")) _1762: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_12_fc2_weight, alloc1763, model_encoder_layers_12_fc2_bias, alloc1764) R.vm.kill_object(alloc1763) R.vm.kill_object(model_encoder_layers_12_fc2_weight) R.vm.kill_object(model_encoder_layers_12_fc2_bias) gv2196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2196, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1761, alloc1764, alloc1765) R.vm.kill_object(alloc1761) R.vm.kill_object(alloc1764) model_encoder_layers_13_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[207] model_encoder_layers_13_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[208] gv2197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2197, R.dtype("float16")) cls.layer_norm1(alloc1765, model_encoder_layers_13_self_attn_layer_norm_weight, model_encoder_layers_13_self_attn_layer_norm_bias, alloc1766) R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_bias) model_encoder_layers_13_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[203] model_encoder_layers_13_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[204] gv2198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2198, R.dtype("float16")) _1765: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_q_proj_weight, alloc1766, model_encoder_layers_13_self_attn_q_proj_bias, alloc1767) R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_bias) gv2199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape104: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1767, gv2199, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1767) model_encoder_layers_13_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[200] gv2200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1768: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2200, R.dtype("float16")) _1766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_13_self_attn_k_proj_weight, alloc1766, alloc1768) R.vm.kill_object(model_encoder_layers_13_self_attn_k_proj_weight) gv2201: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape105: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1768, gv2201, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1768) model_encoder_layers_13_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[201] model_encoder_layers_13_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[202] gv2202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2202, R.dtype("float16")) _1767: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_v_proj_weight, alloc1766, model_encoder_layers_13_self_attn_v_proj_bias, alloc1769) R.vm.kill_object(alloc1766) R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_bias) gv2203: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape106: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1769, gv2203, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1769) gv2204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape107: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape104, gv2204, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape104) gv2205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape108: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape105, gv2205, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape105) gv2206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape109: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape106, gv2206, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape106) gv2207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2207, R.dtype("float16")) _1768: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape107, reshape108, reshape109, alloc1770) R.vm.kill_object(reshape107) R.vm.kill_object(reshape108) R.vm.kill_object(reshape109) gv2208: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape110: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1770, gv2208, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1770) gv2209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape111: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape110, gv2209, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape110) model_encoder_layers_13_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[205] model_encoder_layers_13_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[206] gv2210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2210, R.dtype("float16")) _1769: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_out_proj_weight, reshape111, 
model_encoder_layers_13_self_attn_out_proj_bias, alloc1771) R.vm.kill_object(reshape111) R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_bias) gv2211: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2211, R.dtype("float16")) cls.add4(alloc1765, alloc1771, alloc1772) R.vm.kill_object(alloc1765) R.vm.kill_object(alloc1771) model_encoder_layers_13_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[213] model_encoder_layers_13_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[214] gv2212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2212, R.dtype("float16")) cls.layer_norm1(alloc1772, model_encoder_layers_13_final_layer_norm_weight, model_encoder_layers_13_final_layer_norm_bias, alloc1773) R.vm.kill_object(model_encoder_layers_13_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_13_final_layer_norm_bias) model_encoder_layers_13_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[209] model_encoder_layers_13_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[210] gv2213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1774: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, 
R.prim_value(0), gv2213, R.dtype("float16")) _1772: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_13_fc1_weight, alloc1773, model_encoder_layers_13_fc1_bias, alloc1774) R.vm.kill_object(alloc1773) R.vm.kill_object(model_encoder_layers_13_fc1_weight) R.vm.kill_object(model_encoder_layers_13_fc1_bias) model_encoder_layers_13_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[211] model_encoder_layers_13_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[212] gv2214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2214, R.dtype("float16")) _1773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_13_fc2_weight, alloc1774, model_encoder_layers_13_fc2_bias, alloc1775) R.vm.kill_object(alloc1774) R.vm.kill_object(model_encoder_layers_13_fc2_weight) R.vm.kill_object(model_encoder_layers_13_fc2_bias) gv2215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2215, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1772, alloc1775, alloc1776) R.vm.kill_object(alloc1772) R.vm.kill_object(alloc1775) model_encoder_layers_14_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[222] model_encoder_layers_14_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[223] gv2216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2216, R.dtype("float16")) cls.layer_norm1(alloc1776, model_encoder_layers_14_self_attn_layer_norm_weight, model_encoder_layers_14_self_attn_layer_norm_bias, alloc1777) R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_bias) model_encoder_layers_14_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[218] model_encoder_layers_14_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[219] gv2217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1778: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2217, R.dtype("float16")) _1776: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_q_proj_weight, alloc1777, model_encoder_layers_14_self_attn_q_proj_bias, alloc1778) R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_bias) gv2218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape112: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1778, gv2218, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1778) model_encoder_layers_14_self_attn_k_proj_weight: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[215] gv2219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2219, R.dtype("float16")) _1777: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_14_self_attn_k_proj_weight, alloc1777, alloc1779) R.vm.kill_object(model_encoder_layers_14_self_attn_k_proj_weight) gv2220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape113: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1779, gv2220, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1779) model_encoder_layers_14_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[216] model_encoder_layers_14_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[217] gv2221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2221, R.dtype("float16")) _1778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_v_proj_weight, alloc1777, model_encoder_layers_14_self_attn_v_proj_bias, alloc1780) R.vm.kill_object(alloc1777) R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_weight) 
R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_bias) gv2222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape114: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1780, gv2222, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1780) gv2223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape115: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape112, gv2223, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape112) gv2224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape116: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape113, gv2224, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape113) gv2225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape117: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape114, gv2225, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape114) gv2226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2226, R.dtype("float16")) _1779: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape115, reshape116, reshape117, alloc1781) R.vm.kill_object(reshape115) R.vm.kill_object(reshape116) R.vm.kill_object(reshape117) gv2227: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape118: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1781, gv2227, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1781) gv2228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape119: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape118, gv2228, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape118) model_encoder_layers_14_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[220] model_encoder_layers_14_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[221] gv2229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1782: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage25, R.prim_value(0), gv2229, R.dtype("float16")) _1780: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_out_proj_weight, reshape119, model_encoder_layers_14_self_attn_out_proj_bias, alloc1782) R.vm.kill_object(reshape119) R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_bias) gv2230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2230, R.dtype("float16")) cls.add4(alloc1776, alloc1782, alloc1783) R.vm.kill_object(alloc1776) R.vm.kill_object(alloc1782) model_encoder_layers_14_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[228] model_encoder_layers_14_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[229] gv2231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2231, R.dtype("float16")) cls.layer_norm1(alloc1783, model_encoder_layers_14_final_layer_norm_weight, model_encoder_layers_14_final_layer_norm_bias, alloc1784) R.vm.kill_object(model_encoder_layers_14_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_14_final_layer_norm_bias) model_encoder_layers_14_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[224] model_encoder_layers_14_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[225] gv2232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2232, R.dtype("float16")) _1783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_14_fc1_weight, alloc1784, model_encoder_layers_14_fc1_bias, alloc1785) R.vm.kill_object(alloc1784) R.vm.kill_object(model_encoder_layers_14_fc1_weight) R.vm.kill_object(model_encoder_layers_14_fc1_bias) model_encoder_layers_14_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[226] model_encoder_layers_14_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[227] gv2233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2233, R.dtype("float16")) _1784: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_14_fc2_weight, alloc1785, model_encoder_layers_14_fc2_bias, alloc1786) R.vm.kill_object(alloc1785) R.vm.kill_object(model_encoder_layers_14_fc2_weight) R.vm.kill_object(model_encoder_layers_14_fc2_bias) gv2234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2234, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1783, alloc1786, alloc1787) R.vm.kill_object(alloc1783) R.vm.kill_object(alloc1786) model_encoder_layers_15_self_attn_layer_norm_weight: 
R.Tensor((1280,), dtype="float16") = packed_params[237] model_encoder_layers_15_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[238] gv2235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2235, R.dtype("float16")) cls.layer_norm1(alloc1787, model_encoder_layers_15_self_attn_layer_norm_weight, model_encoder_layers_15_self_attn_layer_norm_bias, alloc1788) R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_bias) model_encoder_layers_15_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[233] model_encoder_layers_15_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[234] gv2236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2236, R.dtype("float16")) _1787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_q_proj_weight, alloc1788, model_encoder_layers_15_self_attn_q_proj_bias, alloc1789) R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_bias) gv2237: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape120: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1789, gv2237, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1789) model_encoder_layers_15_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[230] gv2238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2238, R.dtype("float16")) _1788: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_15_self_attn_k_proj_weight, alloc1788, alloc1790) R.vm.kill_object(model_encoder_layers_15_self_attn_k_proj_weight) gv2239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape121: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1790, gv2239, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1790) model_encoder_layers_15_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[231] model_encoder_layers_15_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[232] gv2240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2240, R.dtype("float16")) _1789: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_v_proj_weight, alloc1788, model_encoder_layers_15_self_attn_v_proj_bias, alloc1791) R.vm.kill_object(alloc1788) R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_bias) gv2241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape122: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1791, gv2241, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1791) gv2242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape123: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape120, gv2242, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape120) gv2243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape124: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape121, gv2243, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape121) gv2244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape125: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape122, gv2244, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape122) gv2245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2245, R.dtype("float16")) _1790: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape123, reshape124, reshape125, alloc1792) R.vm.kill_object(reshape123) R.vm.kill_object(reshape124) R.vm.kill_object(reshape125) gv2246: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape126: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1792, gv2246, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1792) gv2247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape127: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape126, gv2247, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape126) model_encoder_layers_15_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[235] model_encoder_layers_15_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[236] 
gv2248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2248, R.dtype("float16")) _1791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_out_proj_weight, reshape127, model_encoder_layers_15_self_attn_out_proj_bias, alloc1793) R.vm.kill_object(reshape127) R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_bias) gv2249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2249, R.dtype("float16")) cls.add4(alloc1787, alloc1793, alloc1794) R.vm.kill_object(alloc1787) R.vm.kill_object(alloc1793) model_encoder_layers_15_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[243] model_encoder_layers_15_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[244] gv2250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1795: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2250, R.dtype("float16")) cls.layer_norm1(alloc1794, model_encoder_layers_15_final_layer_norm_weight, model_encoder_layers_15_final_layer_norm_bias, alloc1795) R.vm.kill_object(model_encoder_layers_15_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_15_final_layer_norm_bias) model_encoder_layers_15_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[239] model_encoder_layers_15_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[240] gv2251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2251, R.dtype("float16")) _1794: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_15_fc1_weight, alloc1795, model_encoder_layers_15_fc1_bias, alloc1796) R.vm.kill_object(alloc1795) R.vm.kill_object(model_encoder_layers_15_fc1_weight) R.vm.kill_object(model_encoder_layers_15_fc1_bias) model_encoder_layers_15_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[241] model_encoder_layers_15_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[242] gv2252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2252, R.dtype("float16")) _1795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_15_fc2_weight, alloc1796, model_encoder_layers_15_fc2_bias, alloc1797) R.vm.kill_object(alloc1796) R.vm.kill_object(model_encoder_layers_15_fc2_weight) R.vm.kill_object(model_encoder_layers_15_fc2_bias) gv2253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2253, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1794, alloc1797, alloc1798) R.vm.kill_object(alloc1794) R.vm.kill_object(alloc1797) model_encoder_layers_16_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[252] model_encoder_layers_16_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[253] gv2254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2254, R.dtype("float16")) cls.layer_norm1(alloc1798, model_encoder_layers_16_self_attn_layer_norm_weight, model_encoder_layers_16_self_attn_layer_norm_bias, alloc1799) R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_bias) model_encoder_layers_16_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[248] model_encoder_layers_16_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[249] gv2255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2255, R.dtype("float16")) _1798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_q_proj_weight, alloc1799, model_encoder_layers_16_self_attn_q_proj_bias, alloc1800) R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_weight) 
R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_bias) gv2256: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape128: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1800, gv2256, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1800) model_encoder_layers_16_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[245] gv2257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2257, R.dtype("float16")) _1799: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_16_self_attn_k_proj_weight, alloc1799, alloc1801) R.vm.kill_object(model_encoder_layers_16_self_attn_k_proj_weight) gv2258: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape129: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1801, gv2258, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1801) model_encoder_layers_16_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[246] model_encoder_layers_16_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[247] gv2259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2259, R.dtype("float16")) _1800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_v_proj_weight, alloc1799, model_encoder_layers_16_self_attn_v_proj_bias, alloc1802) R.vm.kill_object(alloc1799) R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_bias) gv2260: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape130: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1802, gv2260, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1802) gv2261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape131: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape128, gv2261, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape128) gv2262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape132: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape129, gv2262, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(reshape129) gv2263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape133: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape130, gv2263, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape130) gv2264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2264, R.dtype("float16")) _1801: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape131, reshape132, reshape133, alloc1803) R.vm.kill_object(reshape131) R.vm.kill_object(reshape132) R.vm.kill_object(reshape133) gv2265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape134: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1803, gv2265, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1803) gv2266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape135: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape134, gv2266, 
sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape134) model_encoder_layers_16_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[250] model_encoder_layers_16_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[251] gv2267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2267, R.dtype("float16")) _1802: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_out_proj_weight, reshape135, model_encoder_layers_16_self_attn_out_proj_bias, alloc1804) R.vm.kill_object(reshape135) R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_bias) gv2268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2268, R.dtype("float16")) cls.add4(alloc1798, alloc1804, alloc1805) R.vm.kill_object(alloc1798) R.vm.kill_object(alloc1804) model_encoder_layers_16_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[258] model_encoder_layers_16_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[259] gv2269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1806: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2269, R.dtype("float16")) cls.layer_norm1(alloc1805, model_encoder_layers_16_final_layer_norm_weight, model_encoder_layers_16_final_layer_norm_bias, alloc1806) R.vm.kill_object(model_encoder_layers_16_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_16_final_layer_norm_bias) model_encoder_layers_16_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[254] model_encoder_layers_16_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[255] gv2270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2270, R.dtype("float16")) _1805: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_16_fc1_weight, alloc1806, model_encoder_layers_16_fc1_bias, alloc1807) R.vm.kill_object(alloc1806) R.vm.kill_object(model_encoder_layers_16_fc1_weight) R.vm.kill_object(model_encoder_layers_16_fc1_bias) model_encoder_layers_16_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[256] model_encoder_layers_16_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[257] gv2271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2271, R.dtype("float16")) _1806: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_16_fc2_weight, alloc1807, model_encoder_layers_16_fc2_bias, alloc1808) R.vm.kill_object(alloc1807) 
R.vm.kill_object(model_encoder_layers_16_fc2_weight) R.vm.kill_object(model_encoder_layers_16_fc2_bias) gv2272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2272, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1805, alloc1808, alloc1809) R.vm.kill_object(alloc1805) R.vm.kill_object(alloc1808) model_encoder_layers_17_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[267] model_encoder_layers_17_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[268] gv2273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2273, R.dtype("float16")) cls.layer_norm1(alloc1809, model_encoder_layers_17_self_attn_layer_norm_weight, model_encoder_layers_17_self_attn_layer_norm_bias, alloc1810) R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_bias) model_encoder_layers_17_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[263] model_encoder_layers_17_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[264] gv2274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2274, R.dtype("float16")) 
_1809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_q_proj_weight, alloc1810, model_encoder_layers_17_self_attn_q_proj_bias, alloc1811) R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_bias) gv2275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape136: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1811, gv2275, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1811) model_encoder_layers_17_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[260] gv2276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1812: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2276, R.dtype("float16")) _1810: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_17_self_attn_k_proj_weight, alloc1810, alloc1812) R.vm.kill_object(model_encoder_layers_17_self_attn_k_proj_weight) gv2277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape137: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1812, gv2277, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1812) 
model_encoder_layers_17_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[261] model_encoder_layers_17_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[262] gv2278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2278, R.dtype("float16")) _1811: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_v_proj_weight, alloc1810, model_encoder_layers_17_self_attn_v_proj_bias, alloc1813) R.vm.kill_object(alloc1810) R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_bias) gv2279: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape138: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1813, gv2279, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1813) gv2280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape139: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape136, gv2280, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape136) gv2281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape140: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape137, gv2281, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape137) gv2282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape141: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape138, gv2282, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape138) gv2283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2283, R.dtype("float16")) _1812: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape139, reshape140, reshape141, alloc1814) R.vm.kill_object(reshape139) R.vm.kill_object(reshape140) R.vm.kill_object(reshape141) gv2284: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape142: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1814, gv2284, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1814) gv2285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape143: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape142, gv2285, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape142) model_encoder_layers_17_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[265] model_encoder_layers_17_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[266] gv2286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2286, R.dtype("float16")) _1813: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_out_proj_weight, reshape143, model_encoder_layers_17_self_attn_out_proj_bias, alloc1815) R.vm.kill_object(reshape143) R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_bias) gv2287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1816: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2287, R.dtype("float16")) cls.add4(alloc1809, alloc1815, alloc1816) R.vm.kill_object(alloc1809) R.vm.kill_object(alloc1815) model_encoder_layers_17_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[273] model_encoder_layers_17_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[274] gv2288: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2288, R.dtype("float16")) cls.layer_norm1(alloc1816, model_encoder_layers_17_final_layer_norm_weight, model_encoder_layers_17_final_layer_norm_bias, alloc1817) R.vm.kill_object(model_encoder_layers_17_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_17_final_layer_norm_bias) model_encoder_layers_17_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[269] model_encoder_layers_17_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[270] gv2289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2289, R.dtype("float16")) _1816: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_17_fc1_weight, alloc1817, model_encoder_layers_17_fc1_bias, alloc1818) R.vm.kill_object(alloc1817) R.vm.kill_object(model_encoder_layers_17_fc1_weight) R.vm.kill_object(model_encoder_layers_17_fc1_bias) model_encoder_layers_17_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[271] model_encoder_layers_17_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[272] gv2290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2290, 
R.dtype("float16")) _1817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_17_fc2_weight, alloc1818, model_encoder_layers_17_fc2_bias, alloc1819) R.vm.kill_object(alloc1818) R.vm.kill_object(model_encoder_layers_17_fc2_weight) R.vm.kill_object(model_encoder_layers_17_fc2_bias) gv2291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2291, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1816, alloc1819, alloc1820) R.vm.kill_object(alloc1816) R.vm.kill_object(alloc1819) model_encoder_layers_18_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[282] model_encoder_layers_18_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[283] gv2292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2292, R.dtype("float16")) cls.layer_norm1(alloc1820, model_encoder_layers_18_self_attn_layer_norm_weight, model_encoder_layers_18_self_attn_layer_norm_bias, alloc1821) R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_bias) model_encoder_layers_18_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[278] model_encoder_layers_18_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[279] gv2293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2293, R.dtype("float16")) _1820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_q_proj_weight, alloc1821, model_encoder_layers_18_self_attn_q_proj_bias, alloc1822) R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_bias) gv2294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape144: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1822, gv2294, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1822) model_encoder_layers_18_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[275] gv2295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2295, R.dtype("float16")) _1821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_18_self_attn_k_proj_weight, alloc1821, alloc1823) R.vm.kill_object(model_encoder_layers_18_self_attn_k_proj_weight) gv2296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape145: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1823, gv2296, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1823) model_encoder_layers_18_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[276] model_encoder_layers_18_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[277] gv2297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2297, R.dtype("float16")) _1822: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_v_proj_weight, alloc1821, model_encoder_layers_18_self_attn_v_proj_bias, alloc1824) R.vm.kill_object(alloc1821) R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_bias) gv2298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape146: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1824, gv2298, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1824) gv2299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape147: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape144, gv2299, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape144) gv2300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape148: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape145, gv2300, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape145) gv2301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape149: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape146, gv2301, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape146) gv2302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2302, R.dtype("float16")) _1823: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape147, reshape148, reshape149, alloc1825) R.vm.kill_object(reshape147) R.vm.kill_object(reshape148) R.vm.kill_object(reshape149) gv2303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape150: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1825, gv2303, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1825) gv2304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape151: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape150, gv2304, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape150) model_encoder_layers_18_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[280] model_encoder_layers_18_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[281] gv2305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2305, R.dtype("float16")) _1824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_out_proj_weight, reshape151, model_encoder_layers_18_self_attn_out_proj_bias, alloc1826) R.vm.kill_object(reshape151) R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_bias) gv2306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2306, R.dtype("float16")) cls.add4(alloc1820, alloc1826, alloc1827) R.vm.kill_object(alloc1820) 
R.vm.kill_object(alloc1826) model_encoder_layers_18_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[288] model_encoder_layers_18_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[289] gv2307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2307, R.dtype("float16")) cls.layer_norm1(alloc1827, model_encoder_layers_18_final_layer_norm_weight, model_encoder_layers_18_final_layer_norm_bias, alloc1828) R.vm.kill_object(model_encoder_layers_18_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_18_final_layer_norm_bias) model_encoder_layers_18_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[284] model_encoder_layers_18_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[285] gv2308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1829: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2308, R.dtype("float16")) _1827: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_18_fc1_weight, alloc1828, model_encoder_layers_18_fc1_bias, alloc1829) R.vm.kill_object(alloc1828) R.vm.kill_object(model_encoder_layers_18_fc1_weight) R.vm.kill_object(model_encoder_layers_18_fc1_bias) model_encoder_layers_18_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[286] model_encoder_layers_18_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[287] gv2309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2309, R.dtype("float16")) _1828: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_18_fc2_weight, alloc1829, model_encoder_layers_18_fc2_bias, alloc1830) R.vm.kill_object(alloc1829) R.vm.kill_object(model_encoder_layers_18_fc2_weight) R.vm.kill_object(model_encoder_layers_18_fc2_bias) gv2310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1831: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2310, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1827, alloc1830, alloc1831) R.vm.kill_object(alloc1827) R.vm.kill_object(alloc1830) model_encoder_layers_19_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[297] model_encoder_layers_19_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[298] gv2311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2311, R.dtype("float16")) cls.layer_norm1(alloc1831, model_encoder_layers_19_self_attn_layer_norm_weight, model_encoder_layers_19_self_attn_layer_norm_bias, alloc1832) R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_bias) model_encoder_layers_19_self_attn_q_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[293] model_encoder_layers_19_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[294] gv2312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2312, R.dtype("float16")) _1831: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_q_proj_weight, alloc1832, model_encoder_layers_19_self_attn_q_proj_bias, alloc1833) R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_bias) gv2313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape152: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1833, gv2313, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1833) model_encoder_layers_19_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[290] gv2314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2314, R.dtype("float16")) _1832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_19_self_attn_k_proj_weight, alloc1832, alloc1834) R.vm.kill_object(model_encoder_layers_19_self_attn_k_proj_weight) 
gv2315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape153: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1834, gv2315, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1834) model_encoder_layers_19_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[291] model_encoder_layers_19_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[292] gv2316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2316, R.dtype("float16")) _1833: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_v_proj_weight, alloc1832, model_encoder_layers_19_self_attn_v_proj_bias, alloc1835) R.vm.kill_object(alloc1832) R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_bias) gv2317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape154: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1835, gv2317, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1835) gv2318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape155: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape152, gv2318, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape152) gv2319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape156: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape153, gv2319, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape153) gv2320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape157: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape154, gv2320, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape154) gv2321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2321, R.dtype("float16")) _1834: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape155, reshape156, reshape157, alloc1836) R.vm.kill_object(reshape155) R.vm.kill_object(reshape156) R.vm.kill_object(reshape157) gv2322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape158: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1836, gv2322, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1836) gv2323: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape159: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape158, gv2323, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape158) model_encoder_layers_19_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[295] model_encoder_layers_19_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[296] gv2324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2324, R.dtype("float16")) _1835: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_out_proj_weight, reshape159, model_encoder_layers_19_self_attn_out_proj_bias, alloc1837) R.vm.kill_object(reshape159) R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_bias) gv2325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2325, R.dtype("float16")) cls.add4(alloc1831, alloc1837, alloc1838) R.vm.kill_object(alloc1831) R.vm.kill_object(alloc1837) model_encoder_layers_19_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[303] model_encoder_layers_19_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[304] gv2326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1839: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2326, R.dtype("float16")) cls.layer_norm1(alloc1838, model_encoder_layers_19_final_layer_norm_weight, model_encoder_layers_19_final_layer_norm_bias, alloc1839) R.vm.kill_object(model_encoder_layers_19_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_19_final_layer_norm_bias) model_encoder_layers_19_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[299] model_encoder_layers_19_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[300] gv2327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2327, R.dtype("float16")) _1838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_19_fc1_weight, alloc1839, model_encoder_layers_19_fc1_bias, alloc1840) R.vm.kill_object(alloc1839) R.vm.kill_object(model_encoder_layers_19_fc1_weight) R.vm.kill_object(model_encoder_layers_19_fc1_bias) model_encoder_layers_19_fc2_weight: R.Tensor((1280, 
5120), dtype="float16") = packed_params[301] model_encoder_layers_19_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[302] gv2328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2328, R.dtype("float16")) _1839: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_19_fc2_weight, alloc1840, model_encoder_layers_19_fc2_bias, alloc1841) R.vm.kill_object(alloc1840) R.vm.kill_object(model_encoder_layers_19_fc2_weight) R.vm.kill_object(model_encoder_layers_19_fc2_bias) gv2329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1842: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2329, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1838, alloc1841, alloc1842) R.vm.kill_object(alloc1838) R.vm.kill_object(alloc1841) model_encoder_layers_20_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[312] model_encoder_layers_20_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[313] gv2330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2330, R.dtype("float16")) cls.layer_norm1(alloc1842, model_encoder_layers_20_self_attn_layer_norm_weight, model_encoder_layers_20_self_attn_layer_norm_bias, alloc1843) 
R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_bias) model_encoder_layers_20_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[308] model_encoder_layers_20_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[309] gv2331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2331, R.dtype("float16")) _1842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_q_proj_weight, alloc1843, model_encoder_layers_20_self_attn_q_proj_bias, alloc1844) R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_bias) gv2332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape160: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1844, gv2332, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1844) model_encoder_layers_20_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[305] gv2333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2333, R.dtype("float16")) _1843: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_20_self_attn_k_proj_weight, alloc1843, alloc1845) R.vm.kill_object(model_encoder_layers_20_self_attn_k_proj_weight) gv2334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape161: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1845, gv2334, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1845) model_encoder_layers_20_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[306] model_encoder_layers_20_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[307] gv2335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1846: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2335, R.dtype("float16")) _1844: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_v_proj_weight, alloc1843, model_encoder_layers_20_self_attn_v_proj_bias, alloc1846) R.vm.kill_object(alloc1843) R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_bias) gv2336: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape162: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1846, gv2336, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1846) gv2337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape163: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape160, gv2337, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape160) gv2338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape164: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape161, gv2338, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape161) gv2339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape165: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape162, gv2339, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape162) gv2340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1847: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2340, R.dtype("float16")) _1845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape163, reshape164, 
reshape165, alloc1847) R.vm.kill_object(reshape163) R.vm.kill_object(reshape164) R.vm.kill_object(reshape165) gv2341: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape166: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1847, gv2341, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1847) gv2342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape167: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape166, gv2342, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape166) model_encoder_layers_20_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[310] model_encoder_layers_20_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[311] gv2343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2343, R.dtype("float16")) _1846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_out_proj_weight, reshape167, model_encoder_layers_20_self_attn_out_proj_bias, alloc1848) R.vm.kill_object(reshape167) R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_bias) 
gv2344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1849: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2344, R.dtype("float16")) cls.add4(alloc1842, alloc1848, alloc1849) R.vm.kill_object(alloc1842) R.vm.kill_object(alloc1848) model_encoder_layers_20_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[318] model_encoder_layers_20_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[319] gv2345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2345, R.dtype("float16")) cls.layer_norm1(alloc1849, model_encoder_layers_20_final_layer_norm_weight, model_encoder_layers_20_final_layer_norm_bias, alloc1850) R.vm.kill_object(model_encoder_layers_20_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_20_final_layer_norm_bias) model_encoder_layers_20_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[314] model_encoder_layers_20_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[315] gv2346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2346, R.dtype("float16")) _1849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_20_fc1_weight, alloc1850, 
model_encoder_layers_20_fc1_bias, alloc1851) R.vm.kill_object(alloc1850) R.vm.kill_object(model_encoder_layers_20_fc1_weight) R.vm.kill_object(model_encoder_layers_20_fc1_bias) model_encoder_layers_20_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[316] model_encoder_layers_20_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[317] gv2347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2347, R.dtype("float16")) _1850: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_20_fc2_weight, alloc1851, model_encoder_layers_20_fc2_bias, alloc1852) R.vm.kill_object(alloc1851) R.vm.kill_object(model_encoder_layers_20_fc2_weight) R.vm.kill_object(model_encoder_layers_20_fc2_bias) gv2348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2348, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1849, alloc1852, alloc1853) R.vm.kill_object(alloc1849) R.vm.kill_object(alloc1852) model_encoder_layers_21_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[327] model_encoder_layers_21_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[328] gv2349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1854: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2349, R.dtype("float16")) cls.layer_norm1(alloc1853, model_encoder_layers_21_self_attn_layer_norm_weight, model_encoder_layers_21_self_attn_layer_norm_bias, alloc1854) R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_bias) model_encoder_layers_21_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[323] model_encoder_layers_21_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[324] gv2350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2350, R.dtype("float16")) _1853: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_q_proj_weight, alloc1854, model_encoder_layers_21_self_attn_q_proj_bias, alloc1855) R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_bias) gv2351: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape168: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1855, gv2351, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1855) model_encoder_layers_21_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[320] gv2352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2352, R.dtype("float16")) _1854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_21_self_attn_k_proj_weight, alloc1854, alloc1856) R.vm.kill_object(model_encoder_layers_21_self_attn_k_proj_weight) gv2353: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape169: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1856, gv2353, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1856) model_encoder_layers_21_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[321] model_encoder_layers_21_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[322] gv2354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2354, R.dtype("float16")) _1855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_v_proj_weight, alloc1854, model_encoder_layers_21_self_attn_v_proj_bias, alloc1857) R.vm.kill_object(alloc1854) R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_bias) gv2355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape170: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1857, gv2355, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1857) gv2356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape171: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape168, gv2356, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape168) gv2357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape172: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape169, gv2357, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape169) gv2358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape173: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape170, gv2358, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape170) gv2359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1858: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2359, R.dtype("float16")) _1856: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape171, reshape172, reshape173, alloc1858) R.vm.kill_object(reshape171) R.vm.kill_object(reshape172) R.vm.kill_object(reshape173) gv2360: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape174: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1858, gv2360, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1858) gv2361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape175: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape174, gv2361, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape174) model_encoder_layers_21_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[325] model_encoder_layers_21_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[326] gv2362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1859: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2362, R.dtype("float16")) _1857: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_21_self_attn_out_proj_weight, reshape175, model_encoder_layers_21_self_attn_out_proj_bias, alloc1859) R.vm.kill_object(reshape175) R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_bias) gv2363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2363, R.dtype("float16")) cls.add4(alloc1853, alloc1859, alloc1860) R.vm.kill_object(alloc1853) R.vm.kill_object(alloc1859) model_encoder_layers_21_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[333] model_encoder_layers_21_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[334] gv2364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2364, R.dtype("float16")) cls.layer_norm1(alloc1860, model_encoder_layers_21_final_layer_norm_weight, model_encoder_layers_21_final_layer_norm_bias, alloc1861) R.vm.kill_object(model_encoder_layers_21_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_21_final_layer_norm_bias) model_encoder_layers_21_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[329] model_encoder_layers_21_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[330] gv2365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1862: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2365, R.dtype("float16")) _1860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_21_fc1_weight, alloc1861, model_encoder_layers_21_fc1_bias, alloc1862) R.vm.kill_object(alloc1861) R.vm.kill_object(model_encoder_layers_21_fc1_weight) R.vm.kill_object(model_encoder_layers_21_fc1_bias) model_encoder_layers_21_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[331] model_encoder_layers_21_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[332] gv2366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1863: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2366, R.dtype("float16")) _1861: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_21_fc2_weight, alloc1862, model_encoder_layers_21_fc2_bias, alloc1863) R.vm.kill_object(alloc1862) R.vm.kill_object(model_encoder_layers_21_fc2_weight) R.vm.kill_object(model_encoder_layers_21_fc2_bias) gv2367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2367, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1860, alloc1863, alloc1864) R.vm.kill_object(alloc1860) R.vm.kill_object(alloc1863) model_encoder_layers_22_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[342] model_encoder_layers_22_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[343] gv2368: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2368, R.dtype("float16")) cls.layer_norm1(alloc1864, model_encoder_layers_22_self_attn_layer_norm_weight, model_encoder_layers_22_self_attn_layer_norm_bias, alloc1865) R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_bias) model_encoder_layers_22_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[338] model_encoder_layers_22_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[339] gv2369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2369, R.dtype("float16")) _1864: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_q_proj_weight, alloc1865, model_encoder_layers_22_self_attn_q_proj_bias, alloc1866) R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_bias) gv2370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape176: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1866, gv2370, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1866) model_encoder_layers_22_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[335] gv2371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2371, R.dtype("float16")) _1865: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_22_self_attn_k_proj_weight, alloc1865, alloc1867) R.vm.kill_object(model_encoder_layers_22_self_attn_k_proj_weight) gv2372: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape177: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1867, gv2372, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1867) model_encoder_layers_22_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[336] model_encoder_layers_22_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[337] gv2373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2373, R.dtype("float16")) _1866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_v_proj_weight, alloc1865, model_encoder_layers_22_self_attn_v_proj_bias, alloc1868) R.vm.kill_object(alloc1865) 
R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_bias) gv2374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape178: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1868, gv2374, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1868) gv2375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape179: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape176, gv2375, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape176) gv2376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape180: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape177, gv2376, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape177) gv2377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape181: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape178, gv2377, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape178) gv2378: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2378, R.dtype("float16")) _1867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape179, reshape180, reshape181, alloc1869) R.vm.kill_object(reshape179) R.vm.kill_object(reshape180) R.vm.kill_object(reshape181) gv2379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape182: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1869, gv2379, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1869) gv2380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape183: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape182, gv2380, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape182) model_encoder_layers_22_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[340] model_encoder_layers_22_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[341] gv2381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2381, R.dtype("float16")) _1868: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_out_proj_weight, reshape183, model_encoder_layers_22_self_attn_out_proj_bias, alloc1870) R.vm.kill_object(reshape183) R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_bias) gv2382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2382, R.dtype("float16")) cls.add4(alloc1864, alloc1870, alloc1871) R.vm.kill_object(alloc1864) R.vm.kill_object(alloc1870) model_encoder_layers_22_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[348] model_encoder_layers_22_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[349] gv2383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2383, R.dtype("float16")) cls.layer_norm1(alloc1871, model_encoder_layers_22_final_layer_norm_weight, model_encoder_layers_22_final_layer_norm_bias, alloc1872) R.vm.kill_object(model_encoder_layers_22_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_22_final_layer_norm_bias) model_encoder_layers_22_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[344] model_encoder_layers_22_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[345] gv2384: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2384, R.dtype("float16")) _1871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_22_fc1_weight, alloc1872, model_encoder_layers_22_fc1_bias, alloc1873) R.vm.kill_object(alloc1872) R.vm.kill_object(model_encoder_layers_22_fc1_weight) R.vm.kill_object(model_encoder_layers_22_fc1_bias) model_encoder_layers_22_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[346] model_encoder_layers_22_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[347] gv2385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2385, R.dtype("float16")) _1872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_22_fc2_weight, alloc1873, model_encoder_layers_22_fc2_bias, alloc1874) R.vm.kill_object(alloc1873) R.vm.kill_object(model_encoder_layers_22_fc2_weight) R.vm.kill_object(model_encoder_layers_22_fc2_bias) gv2386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2386, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1871, alloc1874, alloc1875) R.vm.kill_object(alloc1871) R.vm.kill_object(alloc1874) 
model_encoder_layers_23_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[357] model_encoder_layers_23_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[358] gv2387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2387, R.dtype("float16")) cls.layer_norm1(alloc1875, model_encoder_layers_23_self_attn_layer_norm_weight, model_encoder_layers_23_self_attn_layer_norm_bias, alloc1876) R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_bias) model_encoder_layers_23_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[353] model_encoder_layers_23_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[354] gv2388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2388, R.dtype("float16")) _1875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_q_proj_weight, alloc1876, model_encoder_layers_23_self_attn_q_proj_bias, alloc1877) R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_bias) gv2389: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape184: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1877, gv2389, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1877) model_encoder_layers_23_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[350] gv2390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2390, R.dtype("float16")) _1876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_23_self_attn_k_proj_weight, alloc1876, alloc1878) R.vm.kill_object(model_encoder_layers_23_self_attn_k_proj_weight) gv2391: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape185: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1878, gv2391, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1878) model_encoder_layers_23_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[351] model_encoder_layers_23_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[352] gv2392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2392, R.dtype("float16")) _1877: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_v_proj_weight, alloc1876, model_encoder_layers_23_self_attn_v_proj_bias, alloc1879) R.vm.kill_object(alloc1876) R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_bias) gv2393: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape186: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1879, gv2393, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1879) gv2394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape187: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape184, gv2394, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape184) gv2395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape188: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape185, gv2395, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape185) gv2396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape189: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape186, gv2396, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape186) gv2397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1880: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2397, R.dtype("float16")) _1878: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape187, reshape188, reshape189, alloc1880) R.vm.kill_object(reshape187) R.vm.kill_object(reshape188) R.vm.kill_object(reshape189) gv2398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape190: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1880, gv2398, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1880) gv2399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape191: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape190, gv2399, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape190) model_encoder_layers_23_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[355] model_encoder_layers_23_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[356] 
gv2400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2400, R.dtype("float16")) _1879: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_out_proj_weight, reshape191, model_encoder_layers_23_self_attn_out_proj_bias, alloc1881) R.vm.kill_object(reshape191) R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_bias) gv2401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1882: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2401, R.dtype("float16")) cls.add4(alloc1875, alloc1881, alloc1882) R.vm.kill_object(alloc1875) R.vm.kill_object(alloc1881) model_encoder_layers_23_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[363] model_encoder_layers_23_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[364] gv2402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2402, R.dtype("float16")) cls.layer_norm1(alloc1882, model_encoder_layers_23_final_layer_norm_weight, model_encoder_layers_23_final_layer_norm_bias, alloc1883) R.vm.kill_object(model_encoder_layers_23_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_23_final_layer_norm_bias) model_encoder_layers_23_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[359] model_encoder_layers_23_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[360] gv2403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2403, R.dtype("float16")) _1882: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_23_fc1_weight, alloc1883, model_encoder_layers_23_fc1_bias, alloc1884) R.vm.kill_object(alloc1883) R.vm.kill_object(model_encoder_layers_23_fc1_weight) R.vm.kill_object(model_encoder_layers_23_fc1_bias) model_encoder_layers_23_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[361] model_encoder_layers_23_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[362] gv2404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2404, R.dtype("float16")) _1883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_23_fc2_weight, alloc1884, model_encoder_layers_23_fc2_bias, alloc1885) R.vm.kill_object(alloc1884) R.vm.kill_object(model_encoder_layers_23_fc2_weight) R.vm.kill_object(model_encoder_layers_23_fc2_bias) gv2405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2405, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1882, alloc1885, alloc1886) R.vm.kill_object(alloc1882) R.vm.kill_object(alloc1885) model_encoder_layers_24_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[372] model_encoder_layers_24_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[373] gv2406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2406, R.dtype("float16")) cls.layer_norm1(alloc1886, model_encoder_layers_24_self_attn_layer_norm_weight, model_encoder_layers_24_self_attn_layer_norm_bias, alloc1887) R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_bias) model_encoder_layers_24_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[368] model_encoder_layers_24_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[369] gv2407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2407, R.dtype("float16")) _1886: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_q_proj_weight, alloc1887, model_encoder_layers_24_self_attn_q_proj_bias, alloc1888) R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_weight) 
R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_bias) gv2408: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape192: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1888, gv2408, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1888) model_encoder_layers_24_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[365] gv2409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2409, R.dtype("float16")) _1887: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_24_self_attn_k_proj_weight, alloc1887, alloc1889) R.vm.kill_object(model_encoder_layers_24_self_attn_k_proj_weight) gv2410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape193: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1889, gv2410, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1889) model_encoder_layers_24_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[366] model_encoder_layers_24_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[367] gv2411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2411, R.dtype("float16")) _1888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_v_proj_weight, alloc1887, model_encoder_layers_24_self_attn_v_proj_bias, alloc1890) R.vm.kill_object(alloc1887) R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_bias) gv2412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape194: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1890, gv2412, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1890) gv2413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape195: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape192, gv2413, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape192) gv2414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape196: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape193, gv2414, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(reshape193) gv2415: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape197: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape194, gv2415, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape194) gv2416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2416, R.dtype("float16")) _1889: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape195, reshape196, reshape197, alloc1891) R.vm.kill_object(reshape195) R.vm.kill_object(reshape196) R.vm.kill_object(reshape197) gv2417: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape198: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1891, gv2417, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1891) gv2418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape199: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape198, gv2418, 
sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape198) model_encoder_layers_24_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[370] model_encoder_layers_24_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[371] gv2419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2419, R.dtype("float16")) _1890: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_out_proj_weight, reshape199, model_encoder_layers_24_self_attn_out_proj_bias, alloc1892) R.vm.kill_object(reshape199) R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_bias) gv2420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1893: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2420, R.dtype("float16")) cls.add4(alloc1886, alloc1892, alloc1893) R.vm.kill_object(alloc1886) R.vm.kill_object(alloc1892) model_encoder_layers_24_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[378] model_encoder_layers_24_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[379] gv2421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1894: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2421, R.dtype("float16")) cls.layer_norm1(alloc1893, model_encoder_layers_24_final_layer_norm_weight, model_encoder_layers_24_final_layer_norm_bias, alloc1894) R.vm.kill_object(model_encoder_layers_24_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_24_final_layer_norm_bias) model_encoder_layers_24_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[374] model_encoder_layers_24_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[375] gv2422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2422, R.dtype("float16")) _1893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_24_fc1_weight, alloc1894, model_encoder_layers_24_fc1_bias, alloc1895) R.vm.kill_object(alloc1894) R.vm.kill_object(model_encoder_layers_24_fc1_weight) R.vm.kill_object(model_encoder_layers_24_fc1_bias) model_encoder_layers_24_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[376] model_encoder_layers_24_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[377] gv2423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2423, R.dtype("float16")) _1894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_24_fc2_weight, alloc1895, model_encoder_layers_24_fc2_bias, alloc1896) R.vm.kill_object(alloc1895) 
R.vm.kill_object(model_encoder_layers_24_fc2_weight) R.vm.kill_object(model_encoder_layers_24_fc2_bias) gv2424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1897: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2424, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1893, alloc1896, alloc1897) R.vm.kill_object(alloc1893) R.vm.kill_object(alloc1896) model_encoder_layers_25_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[387] model_encoder_layers_25_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[388] gv2425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2425, R.dtype("float16")) cls.layer_norm1(alloc1897, model_encoder_layers_25_self_attn_layer_norm_weight, model_encoder_layers_25_self_attn_layer_norm_bias, alloc1898) R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_bias) model_encoder_layers_25_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[383] model_encoder_layers_25_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[384] gv2426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2426, R.dtype("float16")) 
_1897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_q_proj_weight, alloc1898, model_encoder_layers_25_self_attn_q_proj_bias, alloc1899) R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_bias) gv2427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape200: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1899, gv2427, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1899) model_encoder_layers_25_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[380] gv2428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1900: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2428, R.dtype("float16")) _1898: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_25_self_attn_k_proj_weight, alloc1898, alloc1900) R.vm.kill_object(model_encoder_layers_25_self_attn_k_proj_weight) gv2429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape201: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1900, gv2429, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1900) 
model_encoder_layers_25_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[381] model_encoder_layers_25_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[382] gv2430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1901: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2430, R.dtype("float16")) _1899: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_v_proj_weight, alloc1898, model_encoder_layers_25_self_attn_v_proj_bias, alloc1901) R.vm.kill_object(alloc1898) R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_bias) gv2431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape202: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1901, gv2431, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1901) gv2432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape203: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape200, gv2432, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape200) gv2433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape204: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape201, gv2433, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape201) gv2434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape205: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape202, gv2434, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape202) gv2435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2435, R.dtype("float16")) _1900: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape203, reshape204, reshape205, alloc1902) R.vm.kill_object(reshape203) R.vm.kill_object(reshape204) R.vm.kill_object(reshape205) gv2436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape206: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1902, gv2436, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1902) gv2437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape207: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape206, gv2437, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape206) model_encoder_layers_25_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[385] model_encoder_layers_25_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[386] gv2438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2438, R.dtype("float16")) _1901: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_out_proj_weight, reshape207, model_encoder_layers_25_self_attn_out_proj_bias, alloc1903) R.vm.kill_object(reshape207) R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_bias) gv2439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1904: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2439, R.dtype("float16")) cls.add4(alloc1897, alloc1903, alloc1904) R.vm.kill_object(alloc1897) R.vm.kill_object(alloc1903) model_encoder_layers_25_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[393] model_encoder_layers_25_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[394] gv2440: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2440, R.dtype("float16")) cls.layer_norm1(alloc1904, model_encoder_layers_25_final_layer_norm_weight, model_encoder_layers_25_final_layer_norm_bias, alloc1905) R.vm.kill_object(model_encoder_layers_25_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_25_final_layer_norm_bias) model_encoder_layers_25_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[389] model_encoder_layers_25_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[390] gv2441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2441, R.dtype("float16")) _1904: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_25_fc1_weight, alloc1905, model_encoder_layers_25_fc1_bias, alloc1906) R.vm.kill_object(alloc1905) R.vm.kill_object(model_encoder_layers_25_fc1_weight) R.vm.kill_object(model_encoder_layers_25_fc1_bias) model_encoder_layers_25_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[391] model_encoder_layers_25_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[392] gv2442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2442, 
R.dtype("float16")) _1905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_25_fc2_weight, alloc1906, model_encoder_layers_25_fc2_bias, alloc1907) R.vm.kill_object(alloc1906) R.vm.kill_object(model_encoder_layers_25_fc2_weight) R.vm.kill_object(model_encoder_layers_25_fc2_bias) gv2443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2443, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1904, alloc1907, alloc1908) R.vm.kill_object(alloc1904) R.vm.kill_object(alloc1907) model_encoder_layers_26_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[402] model_encoder_layers_26_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[403] gv2444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2444, R.dtype("float16")) cls.layer_norm1(alloc1908, model_encoder_layers_26_self_attn_layer_norm_weight, model_encoder_layers_26_self_attn_layer_norm_bias, alloc1909) R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_bias) model_encoder_layers_26_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[398] model_encoder_layers_26_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[399] gv2445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2445, R.dtype("float16")) _1908: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_q_proj_weight, alloc1909, model_encoder_layers_26_self_attn_q_proj_bias, alloc1910) R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_bias) gv2446: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape208: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1910, gv2446, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1910) model_encoder_layers_26_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[395] gv2447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2447, R.dtype("float16")) _1909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_26_self_attn_k_proj_weight, alloc1909, alloc1911) R.vm.kill_object(model_encoder_layers_26_self_attn_k_proj_weight) gv2448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape209: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1911, gv2448, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1911) model_encoder_layers_26_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[396] model_encoder_layers_26_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[397] gv2449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2449, R.dtype("float16")) _1910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_v_proj_weight, alloc1909, model_encoder_layers_26_self_attn_v_proj_bias, alloc1912) R.vm.kill_object(alloc1909) R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_bias) gv2450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape210: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1912, gv2450, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1912) gv2451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape211: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape208, gv2451, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape208) gv2452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape212: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape209, gv2452, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape209) gv2453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape213: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape210, gv2453, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape210) gv2454: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2454, R.dtype("float16")) _1911: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape211, reshape212, reshape213, alloc1913) R.vm.kill_object(reshape211) R.vm.kill_object(reshape212) R.vm.kill_object(reshape213) gv2455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape214: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1913, gv2455, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1913) gv2456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape215: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape214, gv2456, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape214) model_encoder_layers_26_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[400] model_encoder_layers_26_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[401] gv2457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1914: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2457, R.dtype("float16")) _1912: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_out_proj_weight, reshape215, model_encoder_layers_26_self_attn_out_proj_bias, alloc1914) R.vm.kill_object(reshape215) R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_bias) gv2458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2458, R.dtype("float16")) cls.add4(alloc1908, alloc1914, alloc1915) R.vm.kill_object(alloc1908) 
R.vm.kill_object(alloc1914) model_encoder_layers_26_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[408] model_encoder_layers_26_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[409] gv2459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2459, R.dtype("float16")) cls.layer_norm1(alloc1915, model_encoder_layers_26_final_layer_norm_weight, model_encoder_layers_26_final_layer_norm_bias, alloc1916) R.vm.kill_object(model_encoder_layers_26_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_26_final_layer_norm_bias) model_encoder_layers_26_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[404] model_encoder_layers_26_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[405] gv2460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2460, R.dtype("float16")) _1915: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_26_fc1_weight, alloc1916, model_encoder_layers_26_fc1_bias, alloc1917) R.vm.kill_object(alloc1916) R.vm.kill_object(model_encoder_layers_26_fc1_weight) R.vm.kill_object(model_encoder_layers_26_fc1_bias) model_encoder_layers_26_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[406] model_encoder_layers_26_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[407] gv2461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2461, R.dtype("float16")) _1916: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_26_fc2_weight, alloc1917, model_encoder_layers_26_fc2_bias, alloc1918) R.vm.kill_object(alloc1917) R.vm.kill_object(model_encoder_layers_26_fc2_weight) R.vm.kill_object(model_encoder_layers_26_fc2_bias) gv2462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2462, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1915, alloc1918, alloc1919) R.vm.kill_object(alloc1915) R.vm.kill_object(alloc1918) model_encoder_layers_27_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[417] model_encoder_layers_27_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[418] gv2463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2463, R.dtype("float16")) cls.layer_norm1(alloc1919, model_encoder_layers_27_self_attn_layer_norm_weight, model_encoder_layers_27_self_attn_layer_norm_bias, alloc1920) R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_bias) model_encoder_layers_27_self_attn_q_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[413] model_encoder_layers_27_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[414] gv2464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2464, R.dtype("float16")) _1919: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_q_proj_weight, alloc1920, model_encoder_layers_27_self_attn_q_proj_bias, alloc1921) R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_bias) gv2465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape216: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1921, gv2465, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1921) model_encoder_layers_27_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[410] gv2466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2466, R.dtype("float16")) _1920: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_27_self_attn_k_proj_weight, alloc1920, alloc1922) R.vm.kill_object(model_encoder_layers_27_self_attn_k_proj_weight) 
gv2467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape217: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1922, gv2467, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1922) model_encoder_layers_27_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[411] model_encoder_layers_27_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[412] gv2468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2468, R.dtype("float16")) _1921: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_v_proj_weight, alloc1920, model_encoder_layers_27_self_attn_v_proj_bias, alloc1923) R.vm.kill_object(alloc1920) R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_bias) gv2469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape218: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1923, gv2469, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1923) gv2470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape219: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape216, gv2470, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape216) gv2471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape220: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape217, gv2471, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape217) gv2472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape221: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape218, gv2472, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape218) gv2473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2473, R.dtype("float16")) _1922: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape219, reshape220, reshape221, alloc1924) R.vm.kill_object(reshape219) R.vm.kill_object(reshape220) R.vm.kill_object(reshape221) gv2474: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape222: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1924, gv2474, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1924) gv2475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape223: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape222, gv2475, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape222) model_encoder_layers_27_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[415] model_encoder_layers_27_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[416] gv2476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2476, R.dtype("float16")) _1923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_out_proj_weight, reshape223, model_encoder_layers_27_self_attn_out_proj_bias, alloc1925) R.vm.kill_object(reshape223) R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_bias) gv2477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2477, R.dtype("float16")) cls.add4(alloc1919, alloc1925, alloc1926) R.vm.kill_object(alloc1919) R.vm.kill_object(alloc1925) model_encoder_layers_27_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[423] model_encoder_layers_27_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[424] gv2478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2478, R.dtype("float16")) cls.layer_norm1(alloc1926, model_encoder_layers_27_final_layer_norm_weight, model_encoder_layers_27_final_layer_norm_bias, alloc1927) R.vm.kill_object(model_encoder_layers_27_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_27_final_layer_norm_bias) model_encoder_layers_27_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[419] model_encoder_layers_27_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[420] gv2479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2479, R.dtype("float16")) _1926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_27_fc1_weight, alloc1927, model_encoder_layers_27_fc1_bias, alloc1928) R.vm.kill_object(alloc1927) R.vm.kill_object(model_encoder_layers_27_fc1_weight) R.vm.kill_object(model_encoder_layers_27_fc1_bias) model_encoder_layers_27_fc2_weight: R.Tensor((1280, 
5120), dtype="float16") = packed_params[421] model_encoder_layers_27_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[422] gv2480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2480, R.dtype("float16")) _1927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_27_fc2_weight, alloc1928, model_encoder_layers_27_fc2_bias, alloc1929) R.vm.kill_object(alloc1928) R.vm.kill_object(model_encoder_layers_27_fc2_weight) R.vm.kill_object(model_encoder_layers_27_fc2_bias) gv2481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2481, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1926, alloc1929, alloc1930) R.vm.kill_object(alloc1926) R.vm.kill_object(alloc1929) model_encoder_layers_28_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[432] model_encoder_layers_28_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[433] gv2482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1931: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2482, R.dtype("float16")) cls.layer_norm1(alloc1930, model_encoder_layers_28_self_attn_layer_norm_weight, model_encoder_layers_28_self_attn_layer_norm_bias, alloc1931) 
R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_bias) model_encoder_layers_28_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[428] model_encoder_layers_28_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[429] gv2483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1932: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2483, R.dtype("float16")) _1930: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_q_proj_weight, alloc1931, model_encoder_layers_28_self_attn_q_proj_bias, alloc1932) R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_bias) gv2484: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape224: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1932, gv2484, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1932) model_encoder_layers_28_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[425] gv2485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1933: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2485, R.dtype("float16")) _1931: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_28_self_attn_k_proj_weight, alloc1931, alloc1933) R.vm.kill_object(model_encoder_layers_28_self_attn_k_proj_weight) gv2486: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape225: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1933, gv2486, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1933) model_encoder_layers_28_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[426] model_encoder_layers_28_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[427] gv2487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2487, R.dtype("float16")) _1932: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_v_proj_weight, alloc1931, model_encoder_layers_28_self_attn_v_proj_bias, alloc1934) R.vm.kill_object(alloc1931) R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_bias) gv2488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape226: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1934, gv2488, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1934) gv2489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape227: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape224, gv2489, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape224) gv2490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape228: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape225, gv2490, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape225) gv2491: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape229: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape226, gv2491, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape226) gv2492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2492, R.dtype("float16")) _1933: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape227, reshape228, 
reshape229, alloc1935) R.vm.kill_object(reshape227) R.vm.kill_object(reshape228) R.vm.kill_object(reshape229) gv2493: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape230: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1935, gv2493, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1935) gv2494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape231: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape230, gv2494, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape230) model_encoder_layers_28_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[430] model_encoder_layers_28_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[431] gv2495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2495, R.dtype("float16")) _1934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_out_proj_weight, reshape231, model_encoder_layers_28_self_attn_out_proj_bias, alloc1936) R.vm.kill_object(reshape231) R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_bias) 
gv2496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2496, R.dtype("float16")) cls.add4(alloc1930, alloc1936, alloc1937) R.vm.kill_object(alloc1930) R.vm.kill_object(alloc1936) model_encoder_layers_28_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[438] model_encoder_layers_28_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[439] gv2497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1938: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2497, R.dtype("float16")) cls.layer_norm1(alloc1937, model_encoder_layers_28_final_layer_norm_weight, model_encoder_layers_28_final_layer_norm_bias, alloc1938) R.vm.kill_object(model_encoder_layers_28_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_28_final_layer_norm_bias) model_encoder_layers_28_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[434] model_encoder_layers_28_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[435] gv2498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2498, R.dtype("float16")) _1937: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_28_fc1_weight, alloc1938, 
model_encoder_layers_28_fc1_bias, alloc1939) R.vm.kill_object(alloc1938) R.vm.kill_object(model_encoder_layers_28_fc1_weight) R.vm.kill_object(model_encoder_layers_28_fc1_bias) model_encoder_layers_28_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[436] model_encoder_layers_28_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[437] gv2499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2499, R.dtype("float16")) _1938: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_28_fc2_weight, alloc1939, model_encoder_layers_28_fc2_bias, alloc1940) R.vm.kill_object(alloc1939) R.vm.kill_object(model_encoder_layers_28_fc2_weight) R.vm.kill_object(model_encoder_layers_28_fc2_bias) gv2500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2500, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1937, alloc1940, alloc1941) R.vm.kill_object(alloc1937) R.vm.kill_object(alloc1940) model_encoder_layers_29_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[447] model_encoder_layers_29_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[448] gv2501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1942: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2501, R.dtype("float16")) cls.layer_norm1(alloc1941, model_encoder_layers_29_self_attn_layer_norm_weight, model_encoder_layers_29_self_attn_layer_norm_bias, alloc1942) R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_bias) model_encoder_layers_29_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[443] model_encoder_layers_29_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[444] gv2502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2502, R.dtype("float16")) _1941: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_q_proj_weight, alloc1942, model_encoder_layers_29_self_attn_q_proj_bias, alloc1943) R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_bias) gv2503: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape232: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1943, gv2503, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1943) model_encoder_layers_29_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[440] gv2504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1944: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2504, R.dtype("float16")) _1942: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_29_self_attn_k_proj_weight, alloc1942, alloc1944) R.vm.kill_object(model_encoder_layers_29_self_attn_k_proj_weight) gv2505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape233: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1944, gv2505, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1944) model_encoder_layers_29_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[441] model_encoder_layers_29_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[442] gv2506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2506, R.dtype("float16")) _1943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_v_proj_weight, alloc1942, model_encoder_layers_29_self_attn_v_proj_bias, alloc1945) R.vm.kill_object(alloc1942) R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_bias) gv2507: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape234: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1945, gv2507, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1945) gv2508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape235: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape232, gv2508, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape232) gv2509: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape236: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape233, gv2509, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape233) gv2510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape237: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape234, gv2510, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape234) gv2511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1946: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2511, R.dtype("float16")) _1944: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape235, reshape236, reshape237, alloc1946) R.vm.kill_object(reshape235) R.vm.kill_object(reshape236) R.vm.kill_object(reshape237) gv2512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape238: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1946, gv2512, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1946) gv2513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape239: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape238, gv2513, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape238) model_encoder_layers_29_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[445] model_encoder_layers_29_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[446] gv2514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2514, R.dtype("float16")) _1945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_29_self_attn_out_proj_weight, reshape239, model_encoder_layers_29_self_attn_out_proj_bias, alloc1947) R.vm.kill_object(reshape239) R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_bias) gv2515: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1948: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2515, R.dtype("float16")) cls.add4(alloc1941, alloc1947, alloc1948) R.vm.kill_object(alloc1941) R.vm.kill_object(alloc1947) model_encoder_layers_29_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[453] model_encoder_layers_29_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[454] gv2516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2516, R.dtype("float16")) cls.layer_norm1(alloc1948, model_encoder_layers_29_final_layer_norm_weight, model_encoder_layers_29_final_layer_norm_bias, alloc1949) R.vm.kill_object(model_encoder_layers_29_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_29_final_layer_norm_bias) model_encoder_layers_29_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[449] model_encoder_layers_29_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[450] gv2517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1950: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2517, R.dtype("float16")) _1948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_29_fc1_weight, alloc1949, model_encoder_layers_29_fc1_bias, alloc1950) R.vm.kill_object(alloc1949) R.vm.kill_object(model_encoder_layers_29_fc1_weight) R.vm.kill_object(model_encoder_layers_29_fc1_bias) model_encoder_layers_29_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[451] model_encoder_layers_29_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[452] gv2518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1951: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2518, R.dtype("float16")) _1949: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_29_fc2_weight, alloc1950, model_encoder_layers_29_fc2_bias, alloc1951) R.vm.kill_object(alloc1950) R.vm.kill_object(model_encoder_layers_29_fc2_weight) R.vm.kill_object(model_encoder_layers_29_fc2_bias) gv2519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2519, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1948, alloc1951, alloc1952) R.vm.kill_object(alloc1948) R.vm.kill_object(alloc1951) model_encoder_layers_30_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[462] model_encoder_layers_30_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[463] gv2520: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2520, R.dtype("float16")) cls.layer_norm1(alloc1952, model_encoder_layers_30_self_attn_layer_norm_weight, model_encoder_layers_30_self_attn_layer_norm_bias, alloc1953) R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_bias) model_encoder_layers_30_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[458] model_encoder_layers_30_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[459] gv2521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2521, R.dtype("float16")) _1952: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_q_proj_weight, alloc1953, model_encoder_layers_30_self_attn_q_proj_bias, alloc1954) R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_bias) gv2522: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape240: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1954, gv2522, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1954) model_encoder_layers_30_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[455] gv2523: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2523, R.dtype("float16")) _1953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_30_self_attn_k_proj_weight, alloc1953, alloc1955) R.vm.kill_object(model_encoder_layers_30_self_attn_k_proj_weight) gv2524: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape241: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1955, gv2524, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1955) model_encoder_layers_30_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[456] model_encoder_layers_30_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[457] gv2525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2525, R.dtype("float16")) _1954: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_v_proj_weight, alloc1953, model_encoder_layers_30_self_attn_v_proj_bias, alloc1956) R.vm.kill_object(alloc1953) 
R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_bias) gv2526: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape242: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1956, gv2526, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1956) gv2527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape243: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape240, gv2527, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape240) gv2528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape244: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape241, gv2528, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape241) gv2529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape245: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape242, gv2529, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape242) gv2530: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2530, R.dtype("float16")) _1955: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape243, reshape244, reshape245, alloc1957) R.vm.kill_object(reshape243) R.vm.kill_object(reshape244) R.vm.kill_object(reshape245) gv2531: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape246: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1957, gv2531, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1957) gv2532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape247: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape246, gv2532, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape246) model_encoder_layers_30_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[460] model_encoder_layers_30_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[461] gv2533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2533, R.dtype("float16")) _1956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_out_proj_weight, reshape247, model_encoder_layers_30_self_attn_out_proj_bias, alloc1958) R.vm.kill_object(reshape247) R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_bias) gv2534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2534, R.dtype("float16")) cls.add4(alloc1952, alloc1958, alloc1959) R.vm.kill_object(alloc1952) R.vm.kill_object(alloc1958) model_encoder_layers_30_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[468] model_encoder_layers_30_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[469] gv2535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2535, R.dtype("float16")) cls.layer_norm1(alloc1959, model_encoder_layers_30_final_layer_norm_weight, model_encoder_layers_30_final_layer_norm_bias, alloc1960) R.vm.kill_object(model_encoder_layers_30_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_30_final_layer_norm_bias) model_encoder_layers_30_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[464] model_encoder_layers_30_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[465] gv2536: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2536, R.dtype("float16")) _1959: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_30_fc1_weight, alloc1960, model_encoder_layers_30_fc1_bias, alloc1961) R.vm.kill_object(alloc1960) R.vm.kill_object(model_encoder_layers_30_fc1_weight) R.vm.kill_object(model_encoder_layers_30_fc1_bias) model_encoder_layers_30_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[466] model_encoder_layers_30_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[467] gv2537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2537, R.dtype("float16")) _1960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_30_fc2_weight, alloc1961, model_encoder_layers_30_fc2_bias, alloc1962) R.vm.kill_object(alloc1961) R.vm.kill_object(model_encoder_layers_30_fc2_weight) R.vm.kill_object(model_encoder_layers_30_fc2_bias) gv2538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2538, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1959, alloc1962, alloc1963) R.vm.kill_object(alloc1959) R.vm.kill_object(alloc1962) 
model_encoder_layers_31_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[477] model_encoder_layers_31_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[478] gv2539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2539, R.dtype("float16")) cls.layer_norm1(alloc1963, model_encoder_layers_31_self_attn_layer_norm_weight, model_encoder_layers_31_self_attn_layer_norm_bias, alloc1964) R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_bias) model_encoder_layers_31_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[473] model_encoder_layers_31_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[474] gv2540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1965: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2540, R.dtype("float16")) _1963: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_q_proj_weight, alloc1964, model_encoder_layers_31_self_attn_q_proj_bias, alloc1965) R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_bias) gv2541: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape248: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1965, gv2541, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1965) model_encoder_layers_31_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[470] gv2542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2542, R.dtype("float16")) _1964: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_31_self_attn_k_proj_weight, alloc1964, alloc1966) R.vm.kill_object(model_encoder_layers_31_self_attn_k_proj_weight) gv2543: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape249: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1966, gv2543, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1966) model_encoder_layers_31_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[471] model_encoder_layers_31_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[472] gv2544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2544, R.dtype("float16")) _1965: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_v_proj_weight, alloc1964, model_encoder_layers_31_self_attn_v_proj_bias, alloc1967) R.vm.kill_object(alloc1964) R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_bias) gv2545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape250: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1967, gv2545, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1967) gv2546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape251: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape248, gv2546, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape248) gv2547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape252: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape249, gv2547, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape249) gv2548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape253: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape250, gv2548, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape250) gv2549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2549, R.dtype("float16")) _1966: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape251, reshape252, reshape253, alloc1968) R.vm.kill_object(reshape251) R.vm.kill_object(reshape252) R.vm.kill_object(reshape253) gv2550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape254: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1968, gv2550, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1968) gv2551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape255: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape254, gv2551, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape254) model_encoder_layers_31_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[475] model_encoder_layers_31_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[476] 
gv2552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2552, R.dtype("float16")) _1967: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_out_proj_weight, reshape255, model_encoder_layers_31_self_attn_out_proj_bias, alloc1969) R.vm.kill_object(reshape255) R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_bias) gv2553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2553, R.dtype("float16")) R.vm.kill_object(storage25) cls.add4(alloc1963, alloc1969, alloc1970) R.vm.kill_object(alloc1963) R.vm.kill_object(alloc1969) model_encoder_layers_31_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[483] model_encoder_layers_31_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[484] gv2554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2554, R.dtype("float16")) R.vm.kill_object(storage28) cls.layer_norm1(alloc1970, model_encoder_layers_31_final_layer_norm_weight, model_encoder_layers_31_final_layer_norm_bias, alloc1971) R.vm.kill_object(model_encoder_layers_31_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_31_final_layer_norm_bias) model_encoder_layers_31_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[479] model_encoder_layers_31_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[480] gv2555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2555, R.dtype("float16")) R.vm.kill_object(storage24) _1970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_31_fc1_weight, alloc1971, model_encoder_layers_31_fc1_bias, alloc1972) R.vm.kill_object(alloc1971) R.vm.kill_object(model_encoder_layers_31_fc1_weight) R.vm.kill_object(model_encoder_layers_31_fc1_bias) model_encoder_layers_31_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[481] model_encoder_layers_31_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[482] gv2556: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2556, R.dtype("float16")) R.vm.kill_object(storage26) _1971: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_31_fc2_weight, alloc1972, model_encoder_layers_31_fc2_bias, alloc1973) R.vm.kill_object(alloc1972) R.vm.kill_object(model_encoder_layers_31_fc2_weight) R.vm.kill_object(model_encoder_layers_31_fc2_bias) gv2557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2557, R.dtype("float16")) R.vm.kill_object(storage27) cls.fused_add4_maximum_minimum(alloc1970, alloc1973, alloc1974) R.vm.kill_object(alloc1970) R.vm.kill_object(alloc1973) model_encoder_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[485] model_encoder_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[486] storage29: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage29, R.prim_value(0), gv2558, R.dtype("float16")) R.vm.kill_object(storage29) cls.layer_norm1(alloc1974, model_encoder_layer_norm_weight, model_encoder_layer_norm_bias, alloc1975) R.vm.kill_object(alloc1974) R.vm.kill_object(model_encoder_layer_norm_weight) R.vm.kill_object(model_encoder_layer_norm_bias) R.call_packed("vm.builtin.match_shape", alloc1975, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), R.str("ErrorContext(fn=batch_encode, loc=return, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) return alloc1975 @R.function def batch_prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), logit_positions: R.Tensor(("batch_size",), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, "batch_size", 51866), dtype="float32"): batch_size = T.int64() seq_len = T.int64() R.func_attr({"num_input": 3, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, 
"seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", logit_positions, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_prefill, loc=param[3], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 
5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", logit_positions, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, 
annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight2: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv10: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape384: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv10, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_tokens_weight2_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage4: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv11: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc4: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv11, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight2_1, reshape384, alloc4) R.vm.kill_object(reshape384) R.vm.kill_object(model_decoder_embed_tokens_weight2_1) gv12: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape385: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc4, gv12, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc4) lv68: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_positions_weight2: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage5: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv13: R.Shape(ndim=2) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc5: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv13, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight2, lv68, alloc5) R.vm.kill_object(lv68) R.vm.kill_object(model_decoder_embed_positions_weight2) gv14: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape386: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc5, gv14, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc5) storage6: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv15: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc6: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv15, R.dtype("float16")) cls.add5(reshape385, reshape386, alloc6) R.vm.kill_object(reshape385) R.vm.kill_object(reshape386) model_decoder_layers_0_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[497] gv16: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc7: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv16, R.dtype("float16")) cls.layer_norm2(alloc6, 
model_decoder_layers_0_self_attn_layer_norm_weight2, model_decoder_layers_0_self_attn_layer_norm_bias2, alloc7) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias2) model_decoder_layers_0_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[493] gv17: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc8: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv17, R.dtype("float16")) _6: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight2, alloc7, model_decoder_layers_0_self_attn_q_proj_bias2, alloc8) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias2) gv18: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape387: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc8, gv18, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc8) model_decoder_layers_0_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage7: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv19: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc9: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv19, R.dtype("float16")) _7: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight2, alloc7, alloc9) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight2) gv20: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape388: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc9, gv20, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc9) model_decoder_layers_0_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[491] storage8: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv21: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc10: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv21, R.dtype("float16")) _8: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight2, alloc7, model_decoder_layers_0_self_attn_v_proj_bias2, alloc10) R.vm.kill_object(alloc7) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias2) gv22: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape389: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc10, gv22, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc10) gv23: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc11: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv23, R.dtype("float16")) cls.concatenate1(reshape387, reshape388, reshape389, alloc11) R.vm.kill_object(reshape387) R.vm.kill_object(reshape388) R.vm.kill_object(reshape389) gv24: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape390: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc11, gv24, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc11) gv25: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc12: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv25, R.dtype("float16")) _10: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape390, alloc12) R.vm.kill_object(reshape390) gv26: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape391: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc12, gv26, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc12) gv27: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape392: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape391, gv27, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape391) model_decoder_layers_0_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[495] gv28: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc13: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv28, R.dtype("float16")) _11: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight2, reshape392, model_decoder_layers_0_self_attn_out_proj_bias2, alloc13) R.vm.kill_object(reshape392) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias2) gv29: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc14: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv29, 
R.dtype("float16")) cls.add5(alloc6, alloc13, alloc14) R.vm.kill_object(alloc6) R.vm.kill_object(alloc13) model_decoder_layers_0_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[506] gv30: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc15: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv30, R.dtype("float16")) cls.layer_norm2(alloc14, model_decoder_layers_0_encoder_attn_layer_norm_weight2, model_decoder_layers_0_encoder_attn_layer_norm_bias2, alloc15) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias2) model_decoder_layers_0_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[502] gv31: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc16: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv31, R.dtype("float16")) _14: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight2, alloc15, model_decoder_layers_0_encoder_attn_q_proj_bias2, alloc16) R.vm.kill_object(alloc15) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias2) gv32: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape393: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc16, gv32, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc16) gv33: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape394: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape393, gv33, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape393) gv34: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc17: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv34, R.dtype("float16")) _15: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape394, alloc17) R.vm.kill_object(reshape394) gv35: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape395: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc17, gv35, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc17) gv36: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape396: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape395, gv36, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape395) model_decoder_layers_0_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[504] gv37: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc18: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv37, R.dtype("float16")) _16: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight2, reshape396, model_decoder_layers_0_encoder_attn_out_proj_bias2, alloc18) R.vm.kill_object(reshape396) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias2) gv38: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc19: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv38, R.dtype("float16")) cls.add5(alloc14, alloc18, alloc19) R.vm.kill_object(alloc14) R.vm.kill_object(alloc18) model_decoder_layers_0_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[512] gv39: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc20: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv39, R.dtype("float16")) cls.layer_norm2(alloc19, model_decoder_layers_0_final_layer_norm_weight2, model_decoder_layers_0_final_layer_norm_bias2, alloc20) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias2) model_decoder_layers_0_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[508] gv40: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc21: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv40, R.dtype("float16")) _19: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_0_fc1_weight2, alloc20, model_decoder_layers_0_fc1_bias2, alloc21) R.vm.kill_object(alloc20) R.vm.kill_object(model_decoder_layers_0_fc1_weight2) R.vm.kill_object(model_decoder_layers_0_fc1_bias2) model_decoder_layers_0_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[510] gv41: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc22: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv41, R.dtype("float16")) _20: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight2, alloc21, model_decoder_layers_0_fc2_bias2, 
alloc22) R.vm.kill_object(alloc21) R.vm.kill_object(model_decoder_layers_0_fc2_weight2) R.vm.kill_object(model_decoder_layers_0_fc2_bias2) gv42: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc23: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv42, R.dtype("float16")) cls.add5(alloc19, alloc22, alloc23) R.vm.kill_object(alloc19) R.vm.kill_object(alloc22) model_decoder_layers_1_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[521] gv43: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc24: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv43, R.dtype("float16")) cls.layer_norm2(alloc23, model_decoder_layers_1_self_attn_layer_norm_weight2, model_decoder_layers_1_self_attn_layer_norm_bias2, alloc24) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias2) model_decoder_layers_1_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[517] gv44: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc25: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv44, R.dtype("float16")) _23: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight2, alloc24, model_decoder_layers_1_self_attn_q_proj_bias2, alloc25) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias2) gv45: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape397: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc25, gv45, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc25) model_decoder_layers_1_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv46: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc26: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv46, R.dtype("float16")) _24: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight2, alloc24, alloc26) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight2) gv47: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape398: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc26, gv47, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc26) model_decoder_layers_1_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[515] gv48: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc27: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv48, R.dtype("float16")) _25: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight2, alloc24, model_decoder_layers_1_self_attn_v_proj_bias2, alloc27) R.vm.kill_object(alloc24) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias2) gv49: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape399: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc27, gv49, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc27) gv50: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc28: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv50, R.dtype("float16")) cls.concatenate1(reshape397, reshape398, reshape399, alloc28) R.vm.kill_object(reshape397) R.vm.kill_object(reshape398) R.vm.kill_object(reshape399) gv51: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape400: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc28, gv51, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc28) gv52: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc29: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv52, R.dtype("float16")) _27: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape400, alloc29) R.vm.kill_object(reshape400) gv53: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape401: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc29, gv53, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc29) gv54: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape402: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape401, gv54, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape401) model_decoder_layers_1_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[519] gv55: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc30: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv55, R.dtype("float16")) _28: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight2, reshape402, model_decoder_layers_1_self_attn_out_proj_bias2, alloc30) R.vm.kill_object(reshape402) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias2) gv56: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc31: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv56, R.dtype("float16")) cls.add5(alloc23, alloc30, alloc31) R.vm.kill_object(alloc23) R.vm.kill_object(alloc30) model_decoder_layers_1_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[530] gv57: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc32: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv57, R.dtype("float16")) cls.layer_norm2(alloc31, model_decoder_layers_1_encoder_attn_layer_norm_weight2, model_decoder_layers_1_encoder_attn_layer_norm_bias2, alloc32) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias2) model_decoder_layers_1_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[526] gv58: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc33: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv58, R.dtype("float16")) _31: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight2, alloc32, model_decoder_layers_1_encoder_attn_q_proj_bias2, alloc33) R.vm.kill_object(alloc32) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias2) gv59: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape403: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc33, gv59, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc33) gv60: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape404: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape403, gv60, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape403) gv61: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc34: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv61, R.dtype("float16")) _32: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape404, alloc34) R.vm.kill_object(reshape404) gv62: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape405: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc34, gv62, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc34) gv63: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape406: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape405, gv63, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape405) model_decoder_layers_1_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[528] gv64: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc35: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv64, R.dtype("float16")) _33: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight2, reshape406, model_decoder_layers_1_encoder_attn_out_proj_bias2, alloc35) 
R.vm.kill_object(reshape406) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias2) gv65: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc36: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv65, R.dtype("float16")) cls.add5(alloc31, alloc35, alloc36) R.vm.kill_object(alloc31) R.vm.kill_object(alloc35) model_decoder_layers_1_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[536] gv66: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc37: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv66, R.dtype("float16")) cls.layer_norm2(alloc36, model_decoder_layers_1_final_layer_norm_weight2, model_decoder_layers_1_final_layer_norm_bias2, alloc37) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias2) model_decoder_layers_1_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[532] gv67: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc38: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv67, R.dtype("float16")) _36: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight2, alloc37, model_decoder_layers_1_fc1_bias2, alloc38) R.vm.kill_object(alloc37) R.vm.kill_object(model_decoder_layers_1_fc1_weight2) R.vm.kill_object(model_decoder_layers_1_fc1_bias2) model_decoder_layers_1_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[534] gv68: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc39: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv68, R.dtype("float16")) _37: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight2, alloc38, model_decoder_layers_1_fc2_bias2, alloc39) R.vm.kill_object(alloc38) R.vm.kill_object(model_decoder_layers_1_fc2_weight2) R.vm.kill_object(model_decoder_layers_1_fc2_bias2) gv69: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc40: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv69, R.dtype("float16")) cls.add5(alloc36, alloc39, alloc40) R.vm.kill_object(alloc36) R.vm.kill_object(alloc39) model_decoder_layers_2_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[545] gv70: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc41: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv70, R.dtype("float16")) cls.layer_norm2(alloc40, model_decoder_layers_2_self_attn_layer_norm_weight2, model_decoder_layers_2_self_attn_layer_norm_bias2, alloc41) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias2) model_decoder_layers_2_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[541] gv71: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc42: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv71, R.dtype("float16")) _40: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_q_proj_weight2, alloc41, model_decoder_layers_2_self_attn_q_proj_bias2, alloc42) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias2) gv72: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape407: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc42, gv72, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc42) model_decoder_layers_2_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv73: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc43: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv73, R.dtype("float16")) _41: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight2, alloc41, alloc43) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight2) gv74: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape408: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc43, gv74, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc43) model_decoder_layers_2_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[539] gv75: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc44: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv75, R.dtype("float16")) _42: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight2, alloc41, model_decoder_layers_2_self_attn_v_proj_bias2, alloc44) R.vm.kill_object(alloc41) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias2) gv76: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape409: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc44, gv76, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc44) gv77: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc45: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv77, R.dtype("float16")) cls.concatenate1(reshape407, reshape408, reshape409, alloc45) R.vm.kill_object(reshape407) R.vm.kill_object(reshape408) R.vm.kill_object(reshape409) gv78: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape410: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc45, gv78, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc45) gv79: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc46: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv79, R.dtype("float16")) _44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape410, alloc46) R.vm.kill_object(reshape410) gv80: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape411: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc46, gv80, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc46) gv81: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape412: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape411, gv81, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape411) model_decoder_layers_2_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[543] gv82: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc47: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv82, R.dtype("float16")) _45: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight2, reshape412, model_decoder_layers_2_self_attn_out_proj_bias2, alloc47) R.vm.kill_object(reshape412) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias2) gv83: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc48: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv83, R.dtype("float16")) cls.add5(alloc40, 
alloc47, alloc48) R.vm.kill_object(alloc40) R.vm.kill_object(alloc47) model_decoder_layers_2_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[554] gv84: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc49: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv84, R.dtype("float16")) cls.layer_norm2(alloc48, model_decoder_layers_2_encoder_attn_layer_norm_weight2, model_decoder_layers_2_encoder_attn_layer_norm_bias2, alloc49) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias2) model_decoder_layers_2_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[550] gv85: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc50: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv85, R.dtype("float16")) _48: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight2, alloc49, model_decoder_layers_2_encoder_attn_q_proj_bias2, alloc50) R.vm.kill_object(alloc49) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias2) gv86: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape413: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc50, gv86, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc50) gv87: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape414: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape413, gv87, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape413) gv88: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc51: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv88, R.dtype("float16")) _49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape414, alloc51) R.vm.kill_object(reshape414) gv89: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape415: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc51, gv89, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc51) gv90: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape416: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape415, gv90, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape415) model_decoder_layers_2_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[552] gv91: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc52: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv91, R.dtype("float16")) _50: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight2, reshape416, model_decoder_layers_2_encoder_attn_out_proj_bias2, alloc52) R.vm.kill_object(reshape416) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias2) gv92: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc53: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv92, R.dtype("float16")) cls.add5(alloc48, alloc52, alloc53) R.vm.kill_object(alloc48) R.vm.kill_object(alloc52) model_decoder_layers_2_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[560] gv93: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc54: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv93, R.dtype("float16")) cls.layer_norm2(alloc53, model_decoder_layers_2_final_layer_norm_weight2, model_decoder_layers_2_final_layer_norm_bias2, alloc54) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias2) model_decoder_layers_2_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[556] gv94: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc55: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv94, R.dtype("float16")) _53: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight2, alloc54, model_decoder_layers_2_fc1_bias2, alloc55) R.vm.kill_object(alloc54) R.vm.kill_object(model_decoder_layers_2_fc1_weight2) R.vm.kill_object(model_decoder_layers_2_fc1_bias2) model_decoder_layers_2_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[558] gv95: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc56: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv95, R.dtype("float16")) _54: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight2, alloc55, model_decoder_layers_2_fc2_bias2, alloc56) R.vm.kill_object(alloc55) 
R.vm.kill_object(model_decoder_layers_2_fc2_weight2) R.vm.kill_object(model_decoder_layers_2_fc2_bias2) gv96: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc57: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv96, R.dtype("float16")) cls.add5(alloc53, alloc56, alloc57) R.vm.kill_object(alloc53) R.vm.kill_object(alloc56) model_decoder_layers_3_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[569] gv97: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc58: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv97, R.dtype("float16")) cls.layer_norm2(alloc57, model_decoder_layers_3_self_attn_layer_norm_weight2, model_decoder_layers_3_self_attn_layer_norm_bias2, alloc58) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias2) model_decoder_layers_3_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[565] gv98: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc59: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv98, R.dtype("float16")) _57: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight2, alloc58, model_decoder_layers_3_self_attn_q_proj_bias2, alloc59) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias2) gv99: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape417: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc59, gv99, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc59) model_decoder_layers_3_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc60: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv100, R.dtype("float16")) _58: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight2, alloc58, alloc60) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight2) gv101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape418: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc60, gv101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc60) model_decoder_layers_3_self_attn_v_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[563] gv102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc61: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv102, R.dtype("float16")) _59: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight2, alloc58, model_decoder_layers_3_self_attn_v_proj_bias2, alloc61) R.vm.kill_object(alloc58) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias2) gv103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape419: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc61, gv103, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc61) gv104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc62: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv104, R.dtype("float16")) cls.concatenate1(reshape417, reshape418, reshape419, alloc62) R.vm.kill_object(reshape417) R.vm.kill_object(reshape418) R.vm.kill_object(reshape419) gv105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape420: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc62, gv105, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc62) gv106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc63: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv106, R.dtype("float16")) _61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape420, alloc63) R.vm.kill_object(reshape420) gv107: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape421: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc63, gv107, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc63) gv108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape422: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape421, gv108, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape421) model_decoder_layers_3_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[567] gv109: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc64: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv109, R.dtype("float16")) _62: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_out_proj_weight2, reshape422, model_decoder_layers_3_self_attn_out_proj_bias2, alloc64) R.vm.kill_object(reshape422) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias2) gv110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc65: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv110, R.dtype("float16")) cls.add5(alloc57, alloc64, alloc65) R.vm.kill_object(alloc57) R.vm.kill_object(alloc64) model_decoder_layers_3_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[578] gv111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc66: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv111, R.dtype("float16")) cls.layer_norm2(alloc65, model_decoder_layers_3_encoder_attn_layer_norm_weight2, model_decoder_layers_3_encoder_attn_layer_norm_bias2, alloc66) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias2) 
model_decoder_layers_3_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[574] gv112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc67: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv112, R.dtype("float16")) _65: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight2, alloc66, model_decoder_layers_3_encoder_attn_q_proj_bias2, alloc67) R.vm.kill_object(alloc66) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias2) gv113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape423: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc67, gv113, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc67) gv114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape424: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape423, gv114, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape423) gv115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc68: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv115, R.dtype("float16")) _66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape424, alloc68) R.vm.kill_object(reshape424) gv116: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape425: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc68, gv116, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc68) gv117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape426: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape425, gv117, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape425) model_decoder_layers_3_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[576] gv118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc69: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv118, R.dtype("float16")) _67: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight2, reshape426, 
model_decoder_layers_3_encoder_attn_out_proj_bias2, alloc69) R.vm.kill_object(reshape426) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias2) gv119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc70: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv119, R.dtype("float16")) cls.add5(alloc65, alloc69, alloc70) R.vm.kill_object(alloc65) R.vm.kill_object(alloc69) model_decoder_layers_3_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[584] gv120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc71: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv120, R.dtype("float16")) cls.layer_norm2(alloc70, model_decoder_layers_3_final_layer_norm_weight2, model_decoder_layers_3_final_layer_norm_bias2, alloc71) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias2) model_decoder_layers_3_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[580] gv121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc72: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv121, 
R.dtype("float16")) _70: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight2, alloc71, model_decoder_layers_3_fc1_bias2, alloc72) R.vm.kill_object(alloc71) R.vm.kill_object(model_decoder_layers_3_fc1_weight2) R.vm.kill_object(model_decoder_layers_3_fc1_bias2) model_decoder_layers_3_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[582] gv122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc73: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv122, R.dtype("float16")) _71: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight2, alloc72, model_decoder_layers_3_fc2_bias2, alloc73) R.vm.kill_object(alloc72) R.vm.kill_object(model_decoder_layers_3_fc2_weight2) R.vm.kill_object(model_decoder_layers_3_fc2_bias2) gv123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc74: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv123, R.dtype("float16")) cls.add5(alloc70, alloc73, alloc74) R.vm.kill_object(alloc70) R.vm.kill_object(alloc73) model_decoder_layers_4_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[593] gv124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc75: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv124, R.dtype("float16")) cls.layer_norm2(alloc74, model_decoder_layers_4_self_attn_layer_norm_weight2, model_decoder_layers_4_self_attn_layer_norm_bias2, alloc75) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias2) model_decoder_layers_4_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[589] gv125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc76: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv125, R.dtype("float16")) _74: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight2, alloc75, model_decoder_layers_4_self_attn_q_proj_bias2, alloc76) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias2) gv126: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape427: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc76, gv126, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc76) model_decoder_layers_4_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc77: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv127, R.dtype("float16")) _75: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight2, alloc75, alloc77) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight2) gv128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape428: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc77, gv128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc77) model_decoder_layers_4_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[587] gv129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc78: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv129, R.dtype("float16")) _76: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight2, alloc75, model_decoder_layers_4_self_attn_v_proj_bias2, alloc78) R.vm.kill_object(alloc75) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias2) gv130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape429: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc78, gv130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc78) gv131: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc79: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv131, R.dtype("float16")) cls.concatenate1(reshape427, reshape428, reshape429, alloc79) R.vm.kill_object(reshape427) R.vm.kill_object(reshape428) R.vm.kill_object(reshape429) gv132: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape430: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc79, gv132, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc79) gv133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc80: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv133, R.dtype("float16")) _78: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape430, alloc80) R.vm.kill_object(reshape430) gv134: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape431: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc80, gv134, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc80) gv135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape432: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape431, gv135, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape431) model_decoder_layers_4_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[591] gv136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc81: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv136, R.dtype("float16")) _79: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight2, reshape432, model_decoder_layers_4_self_attn_out_proj_bias2, alloc81) R.vm.kill_object(reshape432) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias2) gv137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc82: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, 
R.prim_value(0), gv137, R.dtype("float16")) cls.add5(alloc74, alloc81, alloc82) R.vm.kill_object(alloc74) R.vm.kill_object(alloc81) model_decoder_layers_4_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[602] gv138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc83: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv138, R.dtype("float16")) cls.layer_norm2(alloc82, model_decoder_layers_4_encoder_attn_layer_norm_weight2, model_decoder_layers_4_encoder_attn_layer_norm_bias2, alloc83) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias2) model_decoder_layers_4_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[598] gv139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc84: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv139, R.dtype("float16")) _82: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight2, alloc83, model_decoder_layers_4_encoder_attn_q_proj_bias2, alloc84) R.vm.kill_object(alloc83) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias2) gv140: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape433: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc84, gv140, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc84) gv141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape434: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape433, gv141, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape433) gv142: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc85: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv142, R.dtype("float16")) _83: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape434, alloc85) R.vm.kill_object(reshape434) gv143: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape435: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc85, gv143, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc85) gv144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape436: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape435, gv144, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape435) model_decoder_layers_4_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[600] gv145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc86: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv145, R.dtype("float16")) _84: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight2, reshape436, model_decoder_layers_4_encoder_attn_out_proj_bias2, alloc86) R.vm.kill_object(reshape436) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias2) gv146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc87: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv146, R.dtype("float16")) cls.add5(alloc82, alloc86, alloc87) R.vm.kill_object(alloc82) R.vm.kill_object(alloc86) model_decoder_layers_4_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[608] gv147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc88: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv147, R.dtype("float16")) cls.layer_norm2(alloc87, model_decoder_layers_4_final_layer_norm_weight2, model_decoder_layers_4_final_layer_norm_bias2, alloc88) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias2) model_decoder_layers_4_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[604] gv148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc89: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv148, R.dtype("float16")) _87: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight2, alloc88, model_decoder_layers_4_fc1_bias2, alloc89) R.vm.kill_object(alloc88) R.vm.kill_object(model_decoder_layers_4_fc1_weight2) R.vm.kill_object(model_decoder_layers_4_fc1_bias2) model_decoder_layers_4_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[606] gv149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc90: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv149, R.dtype("float16")) _88: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight2, alloc89, 
model_decoder_layers_4_fc2_bias2, alloc90) R.vm.kill_object(alloc89) R.vm.kill_object(model_decoder_layers_4_fc2_weight2) R.vm.kill_object(model_decoder_layers_4_fc2_bias2) gv150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc91: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv150, R.dtype("float16")) cls.add5(alloc87, alloc90, alloc91) R.vm.kill_object(alloc87) R.vm.kill_object(alloc90) model_decoder_layers_5_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[617] gv151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc92: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv151, R.dtype("float16")) cls.layer_norm2(alloc91, model_decoder_layers_5_self_attn_layer_norm_weight2, model_decoder_layers_5_self_attn_layer_norm_bias2, alloc92) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias2) model_decoder_layers_5_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[613] gv152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc93: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv152, 
R.dtype("float16")) _91: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight2, alloc92, model_decoder_layers_5_self_attn_q_proj_bias2, alloc93) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias2) gv153: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape437: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc93, gv153, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc93) model_decoder_layers_5_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc94: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv154, R.dtype("float16")) _92: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight2, alloc92, alloc94) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight2) gv155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape438: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc94, gv155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc94) model_decoder_layers_5_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[611] gv156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc95: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv156, R.dtype("float16")) _93: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_v_proj_weight2, alloc92, model_decoder_layers_5_self_attn_v_proj_bias2, alloc95) R.vm.kill_object(alloc92) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias2) gv157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape439: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc95, gv157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc95) gv158: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc96: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv158, R.dtype("float16")) cls.concatenate1(reshape437, reshape438, reshape439, alloc96) R.vm.kill_object(reshape437) R.vm.kill_object(reshape438) R.vm.kill_object(reshape439) gv159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape440: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc96, gv159, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc96) gv160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc97: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv160, R.dtype("float16")) _95: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape440, alloc97) R.vm.kill_object(reshape440) gv161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape441: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc97, gv161, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc97) gv162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape442: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape441, gv162, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape441) model_decoder_layers_5_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[615] gv163: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc98: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv163, R.dtype("float16")) _96: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight2, reshape442, model_decoder_layers_5_self_attn_out_proj_bias2, alloc98) R.vm.kill_object(reshape442) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias2) gv164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc99: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv164, R.dtype("float16")) cls.add5(alloc91, alloc98, alloc99) R.vm.kill_object(alloc91) R.vm.kill_object(alloc98) model_decoder_layers_5_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[626] gv165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv165, R.dtype("float16")) cls.layer_norm2(alloc99, model_decoder_layers_5_encoder_attn_layer_norm_weight2, model_decoder_layers_5_encoder_attn_layer_norm_bias2, alloc100) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias2) 
model_decoder_layers_5_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[622] gv166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv166, R.dtype("float16")) _99: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight2, alloc100, model_decoder_layers_5_encoder_attn_q_proj_bias2, alloc101) R.vm.kill_object(alloc100) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias2) gv167: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape443: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc101, gv167, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc101) gv168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape444: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape443, gv168, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape443) gv169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv169, R.dtype("float16")) _100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape444, alloc102) R.vm.kill_object(reshape444) gv170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape445: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc102, gv170, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc102) gv171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape446: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape445, gv171, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape445) model_decoder_layers_5_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[624] gv172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv172, R.dtype("float16")) _101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight2, 
reshape446, model_decoder_layers_5_encoder_attn_out_proj_bias2, alloc103) R.vm.kill_object(reshape446) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias2) gv173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv173, R.dtype("float16")) cls.add5(alloc99, alloc103, alloc104) R.vm.kill_object(alloc99) R.vm.kill_object(alloc103) model_decoder_layers_5_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[632] gv174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv174, R.dtype("float16")) cls.layer_norm2(alloc104, model_decoder_layers_5_final_layer_norm_weight2, model_decoder_layers_5_final_layer_norm_bias2, alloc105) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias2) model_decoder_layers_5_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[628] gv175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, 
R.prim_value(0), gv175, R.dtype("float16")) _104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight2, alloc105, model_decoder_layers_5_fc1_bias2, alloc106) R.vm.kill_object(alloc105) R.vm.kill_object(model_decoder_layers_5_fc1_weight2) R.vm.kill_object(model_decoder_layers_5_fc1_bias2) model_decoder_layers_5_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[630] gv176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv176, R.dtype("float16")) _105: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight2, alloc106, model_decoder_layers_5_fc2_bias2, alloc107) R.vm.kill_object(alloc106) R.vm.kill_object(model_decoder_layers_5_fc2_weight2) R.vm.kill_object(model_decoder_layers_5_fc2_bias2) gv177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv177, R.dtype("float16")) cls.add5(alloc104, alloc107, alloc108) R.vm.kill_object(alloc104) R.vm.kill_object(alloc107) model_decoder_layers_6_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[641] gv178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv178, R.dtype("float16")) cls.layer_norm2(alloc108, model_decoder_layers_6_self_attn_layer_norm_weight2, model_decoder_layers_6_self_attn_layer_norm_bias2, alloc109) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias2) model_decoder_layers_6_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[637] gv179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv179, R.dtype("float16")) _108: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight2, alloc109, model_decoder_layers_6_self_attn_q_proj_bias2, alloc110) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias2) gv180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape447: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc110, gv180, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc110) model_decoder_layers_6_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv181: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv181, R.dtype("float16")) _109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight2, alloc109, alloc111) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight2) gv182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape448: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc111, gv182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc111) model_decoder_layers_6_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[635] gv183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv183, R.dtype("float16")) _110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_v_proj_weight2, alloc109, model_decoder_layers_6_self_attn_v_proj_bias2, alloc112) R.vm.kill_object(alloc109) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias2) gv184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape449: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc112, gv184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc112) gv185: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc113: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv185, R.dtype("float16")) cls.concatenate1(reshape447, reshape448, reshape449, alloc113) R.vm.kill_object(reshape447) R.vm.kill_object(reshape448) R.vm.kill_object(reshape449) gv186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape450: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc113, gv186, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc113) gv187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv187, R.dtype("float16")) _112: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape450, alloc114) R.vm.kill_object(reshape450) gv188: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape451: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc114, gv188, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc114) gv189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape452: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape451, gv189, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape451) model_decoder_layers_6_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[639] gv190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv190, R.dtype("float16")) _113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight2, reshape452, model_decoder_layers_6_self_attn_out_proj_bias2, alloc115) R.vm.kill_object(reshape452) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias2) gv191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc116: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv191, R.dtype("float16")) cls.add5(alloc108, alloc115, alloc116) R.vm.kill_object(alloc108) R.vm.kill_object(alloc115) model_decoder_layers_6_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[650] gv192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv192, R.dtype("float16")) cls.layer_norm2(alloc116, model_decoder_layers_6_encoder_attn_layer_norm_weight2, model_decoder_layers_6_encoder_attn_layer_norm_bias2, alloc117) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias2) model_decoder_layers_6_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[646] gv193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv193, R.dtype("float16")) _116: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight2, alloc117, model_decoder_layers_6_encoder_attn_q_proj_bias2, alloc118) R.vm.kill_object(alloc117) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight2) 
R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias2) gv194: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape453: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc118, gv194, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc118) gv195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape454: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape453, gv195, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape453) gv196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv196, R.dtype("float16")) _117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape454, alloc119) R.vm.kill_object(reshape454) gv197: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape455: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc119, gv197, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc119) gv198: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape456: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape455, gv198, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape455) model_decoder_layers_6_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[648] gv199: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv199, R.dtype("float16")) _118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight2, reshape456, model_decoder_layers_6_encoder_attn_out_proj_bias2, alloc120) R.vm.kill_object(reshape456) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias2) gv200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv200, R.dtype("float16")) cls.add5(alloc116, alloc120, alloc121) R.vm.kill_object(alloc116) R.vm.kill_object(alloc120) model_decoder_layers_6_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = 
packed_params[656] gv201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv201, R.dtype("float16")) cls.layer_norm2(alloc121, model_decoder_layers_6_final_layer_norm_weight2, model_decoder_layers_6_final_layer_norm_bias2, alloc122) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias2) model_decoder_layers_6_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[652] gv202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv202, R.dtype("float16")) _121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight2, alloc122, model_decoder_layers_6_fc1_bias2, alloc123) R.vm.kill_object(alloc122) R.vm.kill_object(model_decoder_layers_6_fc1_weight2) R.vm.kill_object(model_decoder_layers_6_fc1_bias2) model_decoder_layers_6_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[654] gv203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv203, 
R.dtype("float16")) _122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_6_fc2_weight2, alloc123, model_decoder_layers_6_fc2_bias2, alloc124) R.vm.kill_object(alloc123) R.vm.kill_object(model_decoder_layers_6_fc2_weight2) R.vm.kill_object(model_decoder_layers_6_fc2_bias2) gv204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv204, R.dtype("float16")) cls.add5(alloc121, alloc124, alloc125) R.vm.kill_object(alloc121) R.vm.kill_object(alloc124) model_decoder_layers_7_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[665] gv205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv205, R.dtype("float16")) cls.layer_norm2(alloc125, model_decoder_layers_7_self_attn_layer_norm_weight2, model_decoder_layers_7_self_attn_layer_norm_bias2, alloc126) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias2) model_decoder_layers_7_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[661] gv206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv206, R.dtype("float16")) _125: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_q_proj_weight2, alloc126, model_decoder_layers_7_self_attn_q_proj_bias2, alloc127) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias2) gv207: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape457: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc127, gv207, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc127) model_decoder_layers_7_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv208: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv208, R.dtype("float16")) _126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight2, alloc126, alloc128) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight2) gv209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape458: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc128, gv209, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc128) model_decoder_layers_7_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[659] gv210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv210, R.dtype("float16")) _127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight2, alloc126, model_decoder_layers_7_self_attn_v_proj_bias2, alloc129) R.vm.kill_object(alloc126) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias2) gv211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape459: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc129, gv211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc129) gv212: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc130: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv212, R.dtype("float16")) cls.concatenate1(reshape457, reshape458, reshape459, alloc130) 
R.vm.kill_object(reshape457) R.vm.kill_object(reshape458) R.vm.kill_object(reshape459) gv213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape460: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc130, gv213, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc130) gv214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv214, R.dtype("float16")) _129: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape460, alloc131) R.vm.kill_object(reshape460) gv215: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape461: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc131, gv215, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc131) gv216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape462: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape461, gv216, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape461) 
model_decoder_layers_7_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[663] gv217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv217, R.dtype("float16")) _130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight2, reshape462, model_decoder_layers_7_self_attn_out_proj_bias2, alloc132) R.vm.kill_object(reshape462) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias2) gv218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv218, R.dtype("float16")) cls.add5(alloc125, alloc132, alloc133) R.vm.kill_object(alloc125) R.vm.kill_object(alloc132) model_decoder_layers_7_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[674] gv219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv219, R.dtype("float16")) cls.layer_norm2(alloc133, 
model_decoder_layers_7_encoder_attn_layer_norm_weight2, model_decoder_layers_7_encoder_attn_layer_norm_bias2, alloc134) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias2) model_decoder_layers_7_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[670] gv220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv220, R.dtype("float16")) _133: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight2, alloc134, model_decoder_layers_7_encoder_attn_q_proj_bias2, alloc135) R.vm.kill_object(alloc134) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias2) gv221: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape463: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc135, gv221, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc135) gv222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape464: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape463, gv222, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape463) gv223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv223, R.dtype("float16")) _134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape464, alloc136) R.vm.kill_object(reshape464) gv224: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape465: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc136, gv224, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc136) gv225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape466: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape465, gv225, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape465) model_decoder_layers_7_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[672] gv226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv226, R.dtype("float16")) _135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight2, reshape466, model_decoder_layers_7_encoder_attn_out_proj_bias2, alloc137) R.vm.kill_object(reshape466) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias2) gv227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv227, R.dtype("float16")) cls.add5(alloc133, alloc137, alloc138) R.vm.kill_object(alloc133) R.vm.kill_object(alloc137) model_decoder_layers_7_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[680] gv228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv228, R.dtype("float16")) cls.layer_norm2(alloc138, model_decoder_layers_7_final_layer_norm_weight2, model_decoder_layers_7_final_layer_norm_bias2, alloc139) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias2) model_decoder_layers_7_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[676] gv229: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv229, R.dtype("float16")) _138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight2, alloc139, model_decoder_layers_7_fc1_bias2, alloc140) R.vm.kill_object(alloc139) R.vm.kill_object(model_decoder_layers_7_fc1_weight2) R.vm.kill_object(model_decoder_layers_7_fc1_bias2) model_decoder_layers_7_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[678] gv230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv230, R.dtype("float16")) _139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight2, alloc140, model_decoder_layers_7_fc2_bias2, alloc141) R.vm.kill_object(alloc140) R.vm.kill_object(model_decoder_layers_7_fc2_weight2) R.vm.kill_object(model_decoder_layers_7_fc2_bias2) gv231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv231, R.dtype("float16")) cls.add5(alloc138, alloc141, alloc142) R.vm.kill_object(alloc138) R.vm.kill_object(alloc141) model_decoder_layers_8_self_attn_layer_norm_weight2: 
R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[689] gv232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv232, R.dtype("float16")) cls.layer_norm2(alloc142, model_decoder_layers_8_self_attn_layer_norm_weight2, model_decoder_layers_8_self_attn_layer_norm_bias2, alloc143) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias2) model_decoder_layers_8_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[685] gv233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv233, R.dtype("float16")) _142: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight2, alloc143, model_decoder_layers_8_self_attn_q_proj_bias2, alloc144) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias2) gv234: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape467: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc144, gv234, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc144) model_decoder_layers_8_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv235, R.dtype("float16")) _143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight2, alloc143, alloc145) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight2) gv236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape468: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc145, gv236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc145) model_decoder_layers_8_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[683] gv237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv237, R.dtype("float16")) _144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight2, 
alloc143, model_decoder_layers_8_self_attn_v_proj_bias2, alloc146) R.vm.kill_object(alloc143) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias2) gv238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape469: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc146, gv238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc146) gv239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc147: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv239, R.dtype("float16")) cls.concatenate1(reshape467, reshape468, reshape469, alloc147) R.vm.kill_object(reshape467) R.vm.kill_object(reshape468) R.vm.kill_object(reshape469) gv240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape470: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc147, gv240, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc147) gv241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv241, R.dtype("float16")) _146: 
R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape470, alloc148) R.vm.kill_object(reshape470) gv242: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape471: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc148, gv242, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc148) gv243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape472: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape471, gv243, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape471) model_decoder_layers_8_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[687] gv244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv244, R.dtype("float16")) _147: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight2, reshape472, model_decoder_layers_8_self_attn_out_proj_bias2, alloc149) R.vm.kill_object(reshape472) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight2) 
R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias2) gv245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv245, R.dtype("float16")) cls.add5(alloc142, alloc149, alloc150) R.vm.kill_object(alloc142) R.vm.kill_object(alloc149) model_decoder_layers_8_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[698] gv246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv246, R.dtype("float16")) cls.layer_norm2(alloc150, model_decoder_layers_8_encoder_attn_layer_norm_weight2, model_decoder_layers_8_encoder_attn_layer_norm_bias2, alloc151) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias2) model_decoder_layers_8_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[694] gv247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv247, R.dtype("float16")) _150: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight2, alloc151, model_decoder_layers_8_encoder_attn_q_proj_bias2, alloc152) R.vm.kill_object(alloc151) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias2) gv248: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape473: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc152, gv248, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc152) gv249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape474: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape473, gv249, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape473) gv250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv250, R.dtype("float16")) _151: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape474, alloc153) R.vm.kill_object(reshape474) gv251: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape475: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc153, gv251, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc153) gv252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape476: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape475, gv252, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape475) model_decoder_layers_8_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[696] gv253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv253, R.dtype("float16")) _152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight2, reshape476, model_decoder_layers_8_encoder_attn_out_proj_bias2, alloc154) R.vm.kill_object(reshape476) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias2) gv254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv254, 
R.dtype("float16")) cls.add5(alloc150, alloc154, alloc155) R.vm.kill_object(alloc150) R.vm.kill_object(alloc154) model_decoder_layers_8_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[704] gv255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv255, R.dtype("float16")) cls.layer_norm2(alloc155, model_decoder_layers_8_final_layer_norm_weight2, model_decoder_layers_8_final_layer_norm_bias2, alloc156) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias2) model_decoder_layers_8_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[700] gv256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv256, R.dtype("float16")) _155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight2, alloc156, model_decoder_layers_8_fc1_bias2, alloc157) R.vm.kill_object(alloc156) R.vm.kill_object(model_decoder_layers_8_fc1_weight2) R.vm.kill_object(model_decoder_layers_8_fc1_bias2) model_decoder_layers_8_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[702] gv257: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv257, R.dtype("float16")) _156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight2, alloc157, model_decoder_layers_8_fc2_bias2, alloc158) R.vm.kill_object(alloc157) R.vm.kill_object(model_decoder_layers_8_fc2_weight2) R.vm.kill_object(model_decoder_layers_8_fc2_bias2) gv258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv258, R.dtype("float16")) cls.add5(alloc155, alloc158, alloc159) R.vm.kill_object(alloc155) R.vm.kill_object(alloc158) model_decoder_layers_9_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[713] gv259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv259, R.dtype("float16")) cls.layer_norm2(alloc159, model_decoder_layers_9_self_attn_layer_norm_weight2, model_decoder_layers_9_self_attn_layer_norm_bias2, alloc160) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias2) model_decoder_layers_9_self_attn_q_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[709] gv260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv260, R.dtype("float16")) _159: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight2, alloc160, model_decoder_layers_9_self_attn_q_proj_bias2, alloc161) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias2) gv261: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape477: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc161, gv261, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc161) model_decoder_layers_9_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv262, R.dtype("float16")) _160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight2, alloc160, alloc162) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight2) gv263: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape478: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc162, gv263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc162) model_decoder_layers_9_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[707] gv264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv264, R.dtype("float16")) _161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight2, alloc160, model_decoder_layers_9_self_attn_v_proj_bias2, alloc163) R.vm.kill_object(alloc160) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias2) gv265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape479: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc163, gv265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc163) gv266: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc164: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv266, R.dtype("float16")) cls.concatenate1(reshape477, reshape478, reshape479, alloc164) R.vm.kill_object(reshape477) R.vm.kill_object(reshape478) R.vm.kill_object(reshape479) gv267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape480: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc164, gv267, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc164) gv268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv268, R.dtype("float16")) _163: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape480, alloc165) R.vm.kill_object(reshape480) gv269: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape481: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc165, gv269, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc165) gv270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape482: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape481, gv270, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape481) model_decoder_layers_9_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[711] gv271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv271, R.dtype("float16")) _164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight2, reshape482, model_decoder_layers_9_self_attn_out_proj_bias2, alloc166) R.vm.kill_object(reshape482) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias2) gv272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc167: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv272, R.dtype("float16")) cls.add5(alloc159, alloc166, alloc167) R.vm.kill_object(alloc159) R.vm.kill_object(alloc166) model_decoder_layers_9_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[722] gv273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc168: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv273, R.dtype("float16")) cls.layer_norm2(alloc167, model_decoder_layers_9_encoder_attn_layer_norm_weight2, model_decoder_layers_9_encoder_attn_layer_norm_bias2, alloc168) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias2) model_decoder_layers_9_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[718] gv274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv274, R.dtype("float16")) _167: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight2, alloc168, model_decoder_layers_9_encoder_attn_q_proj_bias2, alloc169) R.vm.kill_object(alloc168) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias2) gv275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape483: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc169, gv275, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc169) gv276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape484: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape483, gv276, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape483) gv277: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc170: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv277, R.dtype("float16")) _168: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape484, alloc170) R.vm.kill_object(reshape484) gv278: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape485: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc170, gv278, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc170) gv279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape486: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape485, gv279, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape485) model_decoder_layers_9_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = 
packed_params[720] gv280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv280, R.dtype("float16")) _169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight2, reshape486, model_decoder_layers_9_encoder_attn_out_proj_bias2, alloc171) R.vm.kill_object(reshape486) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias2) gv281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv281, R.dtype("float16")) cls.add5(alloc167, alloc171, alloc172) R.vm.kill_object(alloc167) R.vm.kill_object(alloc171) model_decoder_layers_9_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[728] gv282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv282, R.dtype("float16")) cls.layer_norm2(alloc172, model_decoder_layers_9_final_layer_norm_weight2, model_decoder_layers_9_final_layer_norm_bias2, alloc173) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias2) model_decoder_layers_9_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[724] gv283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv283, R.dtype("float16")) _172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight2, alloc173, model_decoder_layers_9_fc1_bias2, alloc174) R.vm.kill_object(alloc173) R.vm.kill_object(model_decoder_layers_9_fc1_weight2) R.vm.kill_object(model_decoder_layers_9_fc1_bias2) model_decoder_layers_9_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[726] gv284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv284, R.dtype("float16")) _173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight2, alloc174, model_decoder_layers_9_fc2_bias2, alloc175) R.vm.kill_object(alloc174) R.vm.kill_object(model_decoder_layers_9_fc2_weight2) R.vm.kill_object(model_decoder_layers_9_fc2_bias2) gv285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc176: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv285, R.dtype("float16")) cls.add5(alloc172, alloc175, alloc176) R.vm.kill_object(alloc172) R.vm.kill_object(alloc175) model_decoder_layers_10_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[737] gv286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv286, R.dtype("float16")) cls.layer_norm2(alloc176, model_decoder_layers_10_self_attn_layer_norm_weight2, model_decoder_layers_10_self_attn_layer_norm_bias2, alloc177) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias2) model_decoder_layers_10_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[733] gv287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv287, R.dtype("float16")) _176: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight2, alloc177, model_decoder_layers_10_self_attn_q_proj_bias2, alloc178) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias2) gv288: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape487: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc178, gv288, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc178) model_decoder_layers_10_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] gv289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv289, R.dtype("float16")) _177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_10_self_attn_k_proj_weight2, alloc177, alloc179) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight2) gv290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape488: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc179, gv290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc179) model_decoder_layers_10_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[731] gv291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv291, R.dtype("float16")) _178: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight2, alloc177, model_decoder_layers_10_self_attn_v_proj_bias2, alloc180) R.vm.kill_object(alloc177) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias2) gv292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape489: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc180, gv292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc180) gv293: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc181: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv293, R.dtype("float16")) cls.concatenate1(reshape487, reshape488, reshape489, alloc181) R.vm.kill_object(reshape487) R.vm.kill_object(reshape488) R.vm.kill_object(reshape489) gv294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape490: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc181, gv294, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc181) gv295: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv295, R.dtype("float16")) _180: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape490, alloc182) R.vm.kill_object(reshape490) gv296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape491: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc182, gv296, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc182) gv297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape492: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape491, gv297, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape491) model_decoder_layers_10_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[735] gv298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv298, R.dtype("float16")) _181: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight2, reshape492, model_decoder_layers_10_self_attn_out_proj_bias2, alloc183) R.vm.kill_object(reshape492) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias2) gv299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv299, R.dtype("float16")) cls.add5(alloc176, alloc183, alloc184) R.vm.kill_object(alloc176) R.vm.kill_object(alloc183) model_decoder_layers_10_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[746] gv300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv300, R.dtype("float16")) cls.layer_norm2(alloc184, model_decoder_layers_10_encoder_attn_layer_norm_weight2, model_decoder_layers_10_encoder_attn_layer_norm_bias2, alloc185) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias2) model_decoder_layers_10_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[742] gv301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv301, R.dtype("float16")) _184: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight2, alloc185, model_decoder_layers_10_encoder_attn_q_proj_bias2, alloc186) R.vm.kill_object(alloc185) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias2) gv302: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape493: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc186, gv302, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc186) gv303: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape494: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape493, gv303, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape493) gv304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv304, R.dtype("float16")) _185: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), 
R.prim_value(T.float32(1)), reshape494, alloc187) R.vm.kill_object(reshape494) gv305: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape495: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc187, gv305, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc187) gv306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape496: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape495, gv306, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape495) model_decoder_layers_10_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[744] gv307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv307, R.dtype("float16")) _186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight2, reshape496, model_decoder_layers_10_encoder_attn_out_proj_bias2, alloc188) R.vm.kill_object(reshape496) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias2) gv308: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv308, R.dtype("float16")) cls.add5(alloc184, alloc188, alloc189) R.vm.kill_object(alloc184) R.vm.kill_object(alloc188) model_decoder_layers_10_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[752] gv309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv309, R.dtype("float16")) cls.layer_norm2(alloc189, model_decoder_layers_10_final_layer_norm_weight2, model_decoder_layers_10_final_layer_norm_bias2, alloc190) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias2) model_decoder_layers_10_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[748] gv310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv310, R.dtype("float16")) _189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight2, alloc190, model_decoder_layers_10_fc1_bias2, alloc191) R.vm.kill_object(alloc190) 
R.vm.kill_object(model_decoder_layers_10_fc1_weight2) R.vm.kill_object(model_decoder_layers_10_fc1_bias2) model_decoder_layers_10_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[750] gv311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv311, R.dtype("float16")) _190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight2, alloc191, model_decoder_layers_10_fc2_bias2, alloc192) R.vm.kill_object(alloc191) R.vm.kill_object(model_decoder_layers_10_fc2_weight2) R.vm.kill_object(model_decoder_layers_10_fc2_bias2) gv312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc193: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv312, R.dtype("float16")) cls.add5(alloc189, alloc192, alloc193) R.vm.kill_object(alloc189) R.vm.kill_object(alloc192) model_decoder_layers_11_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[761] gv313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv313, R.dtype("float16")) cls.layer_norm2(alloc193, 
model_decoder_layers_11_self_attn_layer_norm_weight2, model_decoder_layers_11_self_attn_layer_norm_bias2, alloc194) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias2) model_decoder_layers_11_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[757] gv314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv314, R.dtype("float16")) _193: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight2, alloc194, model_decoder_layers_11_self_attn_q_proj_bias2, alloc195) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias2) gv315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape497: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc195, gv315, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc195) model_decoder_layers_11_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc196: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv316, R.dtype("float16")) _194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight2, alloc194, alloc196) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight2) gv317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape498: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc196, gv317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc196) model_decoder_layers_11_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[755] gv318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv318, R.dtype("float16")) _195: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight2, alloc194, model_decoder_layers_11_self_attn_v_proj_bias2, alloc197) R.vm.kill_object(alloc194) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias2) gv319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape499: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc197, gv319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc197) gv320: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc198: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv320, R.dtype("float16")) cls.concatenate1(reshape497, reshape498, reshape499, alloc198) R.vm.kill_object(reshape497) R.vm.kill_object(reshape498) R.vm.kill_object(reshape499) gv321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape500: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc198, gv321, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc198) gv322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv322, R.dtype("float16")) _197: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape500, alloc199) R.vm.kill_object(reshape500) gv323: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape501: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc199, gv323, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc199) gv324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape502: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape501, gv324, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape501) model_decoder_layers_11_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[759] gv325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv325, R.dtype("float16")) _198: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight2, reshape502, model_decoder_layers_11_self_attn_out_proj_bias2, alloc200) R.vm.kill_object(reshape502) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias2) gv326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv326, R.dtype("float16")) cls.add5(alloc193, alloc200, alloc201) R.vm.kill_object(alloc193) R.vm.kill_object(alloc200) 
model_decoder_layers_11_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[770] gv327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv327, R.dtype("float16")) cls.layer_norm2(alloc201, model_decoder_layers_11_encoder_attn_layer_norm_weight2, model_decoder_layers_11_encoder_attn_layer_norm_bias2, alloc202) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias2) model_decoder_layers_11_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[766] gv328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv328, R.dtype("float16")) _201: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight2, alloc202, model_decoder_layers_11_encoder_attn_q_proj_bias2, alloc203) R.vm.kill_object(alloc202) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias2) gv329: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape503: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc203, gv329, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc203) gv330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape504: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape503, gv330, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape503) gv331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv331, R.dtype("float16")) _202: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape504, alloc204) R.vm.kill_object(reshape504) gv332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape505: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc204, gv332, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc204) gv333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape506: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape505, gv333, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape505) model_decoder_layers_11_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[768] gv334: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv334, R.dtype("float16")) _203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight2, reshape506, model_decoder_layers_11_encoder_attn_out_proj_bias2, alloc205) R.vm.kill_object(reshape506) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias2) gv335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv335, R.dtype("float16")) cls.add5(alloc201, alloc205, alloc206) R.vm.kill_object(alloc201) R.vm.kill_object(alloc205) model_decoder_layers_11_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[776] gv336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv336, R.dtype("float16")) cls.layer_norm2(alloc206, model_decoder_layers_11_final_layer_norm_weight2, model_decoder_layers_11_final_layer_norm_bias2, alloc207) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias2) model_decoder_layers_11_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[772] gv337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc208: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv337, R.dtype("float16")) _206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight2, alloc207, model_decoder_layers_11_fc1_bias2, alloc208) R.vm.kill_object(alloc207) R.vm.kill_object(model_decoder_layers_11_fc1_weight2) R.vm.kill_object(model_decoder_layers_11_fc1_bias2) model_decoder_layers_11_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[774] gv338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv338, R.dtype("float16")) _207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight2, alloc208, model_decoder_layers_11_fc2_bias2, alloc209) 
R.vm.kill_object(alloc208) R.vm.kill_object(model_decoder_layers_11_fc2_weight2) R.vm.kill_object(model_decoder_layers_11_fc2_bias2) gv339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc210: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv339, R.dtype("float16")) cls.add5(alloc206, alloc209, alloc210) R.vm.kill_object(alloc206) R.vm.kill_object(alloc209) model_decoder_layers_12_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[785] gv340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv340, R.dtype("float16")) cls.layer_norm2(alloc210, model_decoder_layers_12_self_attn_layer_norm_weight2, model_decoder_layers_12_self_attn_layer_norm_bias2, alloc211) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias2) model_decoder_layers_12_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[781] gv341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv341, R.dtype("float16")) _210: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight2, alloc211, model_decoder_layers_12_self_attn_q_proj_bias2, alloc212) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias2) gv342: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape507: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc212, gv342, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc212) model_decoder_layers_12_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv343, R.dtype("float16")) _211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight2, alloc211, alloc213) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight2) gv344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape508: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc213, gv344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc213) model_decoder_layers_12_self_attn_v_proj_weight2: R.Tensor((1280, 
1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[779] gv345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv345, R.dtype("float16")) _212: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight2, alloc211, model_decoder_layers_12_self_attn_v_proj_bias2, alloc214) R.vm.kill_object(alloc211) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias2) gv346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape509: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc214, gv346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc214) gv347: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc215: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv347, R.dtype("float16")) cls.concatenate1(reshape507, reshape508, reshape509, alloc215) R.vm.kill_object(reshape507) R.vm.kill_object(reshape508) R.vm.kill_object(reshape509) gv348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape510: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc215, gv348, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc215) gv349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv349, R.dtype("float16")) _214: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape510, alloc216) R.vm.kill_object(reshape510) gv350: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape511: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc216, gv350, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc216) gv351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape512: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape511, gv351, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape511) model_decoder_layers_12_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[783] gv352: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv352, R.dtype("float16")) _215: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight2, reshape512, model_decoder_layers_12_self_attn_out_proj_bias2, alloc217) R.vm.kill_object(reshape512) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias2) gv353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv353, R.dtype("float16")) cls.add5(alloc210, alloc217, alloc218) R.vm.kill_object(alloc210) R.vm.kill_object(alloc217) model_decoder_layers_12_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[794] gv354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv354, R.dtype("float16")) cls.layer_norm2(alloc218, model_decoder_layers_12_encoder_attn_layer_norm_weight2, model_decoder_layers_12_encoder_attn_layer_norm_bias2, alloc219) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias2) model_decoder_layers_12_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[790] gv355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv355, R.dtype("float16")) _218: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight2, alloc219, model_decoder_layers_12_encoder_attn_q_proj_bias2, alloc220) R.vm.kill_object(alloc219) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias2) gv356: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape513: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc220, gv356, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc220) gv357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape514: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape513, gv357, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape513) gv358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv358, R.dtype("float16")) _219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape514, alloc221) R.vm.kill_object(reshape514) gv359: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape515: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc221, gv359, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc221) gv360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape516: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape515, gv360, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape515) model_decoder_layers_12_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[792] gv361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv361, R.dtype("float16")) _220: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight2, reshape516, model_decoder_layers_12_encoder_attn_out_proj_bias2, alloc222) R.vm.kill_object(reshape516) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias2) gv362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc223: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv362, R.dtype("float16")) cls.add5(alloc218, alloc222, alloc223) R.vm.kill_object(alloc218) R.vm.kill_object(alloc222) model_decoder_layers_12_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[800] gv363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv363, R.dtype("float16")) cls.layer_norm2(alloc223, model_decoder_layers_12_final_layer_norm_weight2, model_decoder_layers_12_final_layer_norm_bias2, alloc224) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias2) model_decoder_layers_12_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[796] gv364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv364, R.dtype("float16")) _223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight2, alloc224, model_decoder_layers_12_fc1_bias2, alloc225) R.vm.kill_object(alloc224) R.vm.kill_object(model_decoder_layers_12_fc1_weight2) R.vm.kill_object(model_decoder_layers_12_fc1_bias2) model_decoder_layers_12_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[798] gv365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv365, R.dtype("float16")) _224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight2, alloc225, model_decoder_layers_12_fc2_bias2, alloc226) R.vm.kill_object(alloc225) R.vm.kill_object(model_decoder_layers_12_fc2_weight2) R.vm.kill_object(model_decoder_layers_12_fc2_bias2) gv366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc227: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv366, R.dtype("float16")) cls.add5(alloc223, alloc226, alloc227) R.vm.kill_object(alloc223) R.vm.kill_object(alloc226) model_decoder_layers_13_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[809] gv367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv367, R.dtype("float16")) cls.layer_norm2(alloc227, model_decoder_layers_13_self_attn_layer_norm_weight2, model_decoder_layers_13_self_attn_layer_norm_bias2, alloc228) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias2) model_decoder_layers_13_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[805] gv368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv368, R.dtype("float16")) _227: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight2, alloc228, model_decoder_layers_13_self_attn_q_proj_bias2, alloc229) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias2) gv369: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape517: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc229, gv369, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc229) model_decoder_layers_13_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv370, R.dtype("float16")) _228: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight2, alloc228, alloc230) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight2) gv371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape518: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc230, gv371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc230) model_decoder_layers_13_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[803] gv372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv372, R.dtype("float16")) _229: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight2, alloc228, model_decoder_layers_13_self_attn_v_proj_bias2, alloc231) R.vm.kill_object(alloc228) 
R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias2) gv373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape519: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc231, gv373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc231) gv374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc232: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv374, R.dtype("float16")) cls.concatenate1(reshape517, reshape518, reshape519, alloc232) R.vm.kill_object(reshape517) R.vm.kill_object(reshape518) R.vm.kill_object(reshape519) gv375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape520: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc232, gv375, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc232) gv376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv376, R.dtype("float16")) _231: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape520, alloc233) R.vm.kill_object(reshape520) gv377: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape521: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc233, gv377, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc233) gv378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape522: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape521, gv378, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape521) model_decoder_layers_13_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[807] gv379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv379, R.dtype("float16")) _232: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight2, reshape522, model_decoder_layers_13_self_attn_out_proj_bias2, alloc234) R.vm.kill_object(reshape522) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias2) gv380: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv380, R.dtype("float16")) cls.add5(alloc227, alloc234, alloc235) R.vm.kill_object(alloc227) R.vm.kill_object(alloc234) model_decoder_layers_13_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[818] gv381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv381, R.dtype("float16")) cls.layer_norm2(alloc235, model_decoder_layers_13_encoder_attn_layer_norm_weight2, model_decoder_layers_13_encoder_attn_layer_norm_bias2, alloc236) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias2) model_decoder_layers_13_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[814] gv382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv382, R.dtype("float16")) _235: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight2, 
alloc236, model_decoder_layers_13_encoder_attn_q_proj_bias2, alloc237) R.vm.kill_object(alloc236) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias2) gv383: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape523: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc237, gv383, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc237) gv384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape524: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape523, gv384, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape523) gv385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv385, R.dtype("float16")) _236: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape524, alloc238) R.vm.kill_object(reshape524) gv386: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape525: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc238, gv386, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc238) gv387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape526: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape525, gv387, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape525) model_decoder_layers_13_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[816] gv388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv388, R.dtype("float16")) _237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight2, reshape526, model_decoder_layers_13_encoder_attn_out_proj_bias2, alloc239) R.vm.kill_object(reshape526) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias2) gv389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv389, R.dtype("float16")) cls.add5(alloc235, alloc239, alloc240) R.vm.kill_object(alloc235) R.vm.kill_object(alloc239) 
model_decoder_layers_13_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[824] gv390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv390, R.dtype("float16")) cls.layer_norm2(alloc240, model_decoder_layers_13_final_layer_norm_weight2, model_decoder_layers_13_final_layer_norm_bias2, alloc241) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias2) model_decoder_layers_13_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[820] gv391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv391, R.dtype("float16")) _240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight2, alloc241, model_decoder_layers_13_fc1_bias2, alloc242) R.vm.kill_object(alloc241) R.vm.kill_object(model_decoder_layers_13_fc1_weight2) R.vm.kill_object(model_decoder_layers_13_fc1_bias2) model_decoder_layers_13_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[822] gv392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc243: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv392, R.dtype("float16")) _241: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight2, alloc242, model_decoder_layers_13_fc2_bias2, alloc243) R.vm.kill_object(alloc242) R.vm.kill_object(model_decoder_layers_13_fc2_weight2) R.vm.kill_object(model_decoder_layers_13_fc2_bias2) gv393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc244: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv393, R.dtype("float16")) cls.add5(alloc240, alloc243, alloc244) R.vm.kill_object(alloc240) R.vm.kill_object(alloc243) model_decoder_layers_14_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[833] gv394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv394, R.dtype("float16")) cls.layer_norm2(alloc244, model_decoder_layers_14_self_attn_layer_norm_weight2, model_decoder_layers_14_self_attn_layer_norm_bias2, alloc245) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias2) model_decoder_layers_14_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias2: 
R.Tensor((1280,), dtype="float16") = packed_params[829] gv395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv395, R.dtype("float16")) _244: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight2, alloc245, model_decoder_layers_14_self_attn_q_proj_bias2, alloc246) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias2) gv396: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape527: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc246, gv396, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc246) model_decoder_layers_14_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv397, R.dtype("float16")) _245: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight2, alloc245, alloc247) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight2) gv398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape528: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc247, gv398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc247) model_decoder_layers_14_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[827] gv399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv399, R.dtype("float16")) _246: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight2, alloc245, model_decoder_layers_14_self_attn_v_proj_bias2, alloc248) R.vm.kill_object(alloc245) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias2) gv400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape529: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc248, gv400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc248) gv401: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc249: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv401, R.dtype("float16")) cls.concatenate1(reshape527, reshape528, reshape529, alloc249) R.vm.kill_object(reshape527) R.vm.kill_object(reshape528) R.vm.kill_object(reshape529) gv402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape530: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc249, gv402, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc249) gv403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv403, R.dtype("float16")) _248: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape530, alloc250) R.vm.kill_object(reshape530) gv404: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape531: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc250, gv404, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc250) gv405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape532: R.Tensor((1, 
seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape531, gv405, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape531) model_decoder_layers_14_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[831] gv406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv406, R.dtype("float16")) _249: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight2, reshape532, model_decoder_layers_14_self_attn_out_proj_bias2, alloc251) R.vm.kill_object(reshape532) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias2) gv407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv407, R.dtype("float16")) cls.add5(alloc244, alloc251, alloc252) R.vm.kill_object(alloc244) R.vm.kill_object(alloc251) model_decoder_layers_14_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[842] gv408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc253: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv408, R.dtype("float16")) cls.layer_norm2(alloc252, model_decoder_layers_14_encoder_attn_layer_norm_weight2, model_decoder_layers_14_encoder_attn_layer_norm_bias2, alloc253) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias2) model_decoder_layers_14_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[838] gv409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv409, R.dtype("float16")) _252: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight2, alloc253, model_decoder_layers_14_encoder_attn_q_proj_bias2, alloc254) R.vm.kill_object(alloc253) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias2) gv410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape533: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc254, gv410, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc254) gv411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape534: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape533, gv411, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape533) gv412: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv412, R.dtype("float16")) _253: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape534, alloc255) R.vm.kill_object(reshape534) gv413: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape535: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc255, gv413, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc255) gv414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape536: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape535, gv414, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape535) model_decoder_layers_14_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[840] gv415: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv415, R.dtype("float16")) _254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight2, reshape536, model_decoder_layers_14_encoder_attn_out_proj_bias2, alloc256) R.vm.kill_object(reshape536) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias2) gv416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv416, R.dtype("float16")) cls.add5(alloc252, alloc256, alloc257) R.vm.kill_object(alloc252) R.vm.kill_object(alloc256) model_decoder_layers_14_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[848] gv417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv417, R.dtype("float16")) cls.layer_norm2(alloc257, model_decoder_layers_14_final_layer_norm_weight2, model_decoder_layers_14_final_layer_norm_bias2, alloc258) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias2) 
model_decoder_layers_14_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[844] gv418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv418, R.dtype("float16")) _257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight2, alloc258, model_decoder_layers_14_fc1_bias2, alloc259) R.vm.kill_object(alloc258) R.vm.kill_object(model_decoder_layers_14_fc1_weight2) R.vm.kill_object(model_decoder_layers_14_fc1_bias2) model_decoder_layers_14_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[846] gv419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv419, R.dtype("float16")) _258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight2, alloc259, model_decoder_layers_14_fc2_bias2, alloc260) R.vm.kill_object(alloc259) R.vm.kill_object(model_decoder_layers_14_fc2_weight2) R.vm.kill_object(model_decoder_layers_14_fc2_bias2) gv420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc261: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv420, R.dtype("float16")) cls.add5(alloc257, alloc260, alloc261) R.vm.kill_object(alloc257) R.vm.kill_object(alloc260) model_decoder_layers_15_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[857] gv421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv421, R.dtype("float16")) cls.layer_norm2(alloc261, model_decoder_layers_15_self_attn_layer_norm_weight2, model_decoder_layers_15_self_attn_layer_norm_bias2, alloc262) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias2) model_decoder_layers_15_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[853] gv422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv422, R.dtype("float16")) _261: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight2, alloc262, model_decoder_layers_15_self_attn_q_proj_bias2, alloc263) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias2) gv423: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape537: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc263, gv423, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc263) model_decoder_layers_15_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv424, R.dtype("float16")) _262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight2, alloc262, alloc264) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight2) gv425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape538: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc264, gv425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc264) model_decoder_layers_15_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[851] gv426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc265: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv426, R.dtype("float16")) _263: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight2, alloc262, model_decoder_layers_15_self_attn_v_proj_bias2, alloc265) R.vm.kill_object(alloc262) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias2) gv427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape539: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc265, gv427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc265) gv428: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc266: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv428, R.dtype("float16")) cls.concatenate1(reshape537, reshape538, reshape539, alloc266) R.vm.kill_object(reshape537) R.vm.kill_object(reshape538) R.vm.kill_object(reshape539) gv429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape540: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc266, gv429, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc266) gv430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv430, R.dtype("float16")) _265: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape540, alloc267) R.vm.kill_object(reshape540) gv431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape541: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc267, gv431, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc267) gv432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape542: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape541, gv432, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape541) model_decoder_layers_15_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[855] gv433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv433, R.dtype("float16")) _266: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight2, reshape542, model_decoder_layers_15_self_attn_out_proj_bias2, alloc268) R.vm.kill_object(reshape542) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias2) gv434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv434, R.dtype("float16")) cls.add5(alloc261, alloc268, alloc269) R.vm.kill_object(alloc261) R.vm.kill_object(alloc268) model_decoder_layers_15_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[866] gv435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv435, R.dtype("float16")) cls.layer_norm2(alloc269, model_decoder_layers_15_encoder_attn_layer_norm_weight2, model_decoder_layers_15_encoder_attn_layer_norm_bias2, alloc270) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias2) model_decoder_layers_15_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[862] gv436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv436, R.dtype("float16")) _269: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight2, alloc270, model_decoder_layers_15_encoder_attn_q_proj_bias2, alloc271) R.vm.kill_object(alloc270) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias2) gv437: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape543: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc271, gv437, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc271) gv438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape544: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape543, gv438, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape543) gv439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc272: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv439, R.dtype("float16")) _270: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), 
R.prim_value(T.float32(1)), reshape544, alloc272) R.vm.kill_object(reshape544) gv440: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape545: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc272, gv440, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc272) gv441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape546: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape545, gv441, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape545) model_decoder_layers_15_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[864] gv442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv442, R.dtype("float16")) _271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight2, reshape546, model_decoder_layers_15_encoder_attn_out_proj_bias2, alloc273) R.vm.kill_object(reshape546) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias2) gv443: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv443, R.dtype("float16")) cls.add5(alloc269, alloc273, alloc274) R.vm.kill_object(alloc269) R.vm.kill_object(alloc273) model_decoder_layers_15_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[872] gv444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv444, R.dtype("float16")) cls.layer_norm2(alloc274, model_decoder_layers_15_final_layer_norm_weight2, model_decoder_layers_15_final_layer_norm_bias2, alloc275) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias2) model_decoder_layers_15_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[868] gv445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc276: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv445, R.dtype("float16")) _274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight2, alloc275, model_decoder_layers_15_fc1_bias2, alloc276) R.vm.kill_object(alloc275) 
R.vm.kill_object(model_decoder_layers_15_fc1_weight2) R.vm.kill_object(model_decoder_layers_15_fc1_bias2) model_decoder_layers_15_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[870] gv446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv446, R.dtype("float16")) _275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight2, alloc276, model_decoder_layers_15_fc2_bias2, alloc277) R.vm.kill_object(alloc276) R.vm.kill_object(model_decoder_layers_15_fc2_weight2) R.vm.kill_object(model_decoder_layers_15_fc2_bias2) gv447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc278: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv447, R.dtype("float16")) cls.add5(alloc274, alloc277, alloc278) R.vm.kill_object(alloc274) R.vm.kill_object(alloc277) model_decoder_layers_16_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[881] gv448: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv448, R.dtype("float16")) cls.layer_norm2(alloc278, 
model_decoder_layers_16_self_attn_layer_norm_weight2, model_decoder_layers_16_self_attn_layer_norm_bias2, alloc279) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias2) model_decoder_layers_16_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[877] gv449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv449, R.dtype("float16")) _278: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight2, alloc279, model_decoder_layers_16_self_attn_q_proj_bias2, alloc280) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias2) gv450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape547: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc280, gv450, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc280) model_decoder_layers_16_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc281: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv451, R.dtype("float16")) _279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight2, alloc279, alloc281) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight2) gv452: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape548: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc281, gv452, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc281) model_decoder_layers_16_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[875] gv453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv453, R.dtype("float16")) _280: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight2, alloc279, model_decoder_layers_16_self_attn_v_proj_bias2, alloc282) R.vm.kill_object(alloc279) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias2) gv454: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape549: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc282, gv454, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc282) gv455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc283: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv455, R.dtype("float16")) cls.concatenate1(reshape547, reshape548, reshape549, alloc283) R.vm.kill_object(reshape547) R.vm.kill_object(reshape548) R.vm.kill_object(reshape549) gv456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape550: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc283, gv456, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc283) gv457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv457, R.dtype("float16")) _282: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape550, alloc284) R.vm.kill_object(reshape550) gv458: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape551: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc284, gv458, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc284) gv459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape552: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape551, gv459, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape551) model_decoder_layers_16_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[879] gv460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv460, R.dtype("float16")) _283: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight2, reshape552, model_decoder_layers_16_self_attn_out_proj_bias2, alloc285) R.vm.kill_object(reshape552) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias2) gv461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv461, R.dtype("float16")) cls.add5(alloc278, alloc285, alloc286) R.vm.kill_object(alloc278) R.vm.kill_object(alloc285) 
model_decoder_layers_16_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[890] gv462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv462, R.dtype("float16")) cls.layer_norm2(alloc286, model_decoder_layers_16_encoder_attn_layer_norm_weight2, model_decoder_layers_16_encoder_attn_layer_norm_bias2, alloc287) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias2) model_decoder_layers_16_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[886] gv463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv463, R.dtype("float16")) _286: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight2, alloc287, model_decoder_layers_16_encoder_attn_q_proj_bias2, alloc288) R.vm.kill_object(alloc287) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias2) gv464: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape553: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc288, gv464, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc288) gv465: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape554: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape553, gv465, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape553) gv466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv466, R.dtype("float16")) _287: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape554, alloc289) R.vm.kill_object(reshape554) gv467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape555: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc289, gv467, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc289) gv468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape556: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape555, gv468, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape555) model_decoder_layers_16_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[888] gv469: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv469, R.dtype("float16")) _288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight2, reshape556, model_decoder_layers_16_encoder_attn_out_proj_bias2, alloc290) R.vm.kill_object(reshape556) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias2) gv470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc291: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv470, R.dtype("float16")) cls.add5(alloc286, alloc290, alloc291) R.vm.kill_object(alloc286) R.vm.kill_object(alloc290) model_decoder_layers_16_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[896] gv471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv471, R.dtype("float16")) cls.layer_norm2(alloc291, model_decoder_layers_16_final_layer_norm_weight2, model_decoder_layers_16_final_layer_norm_bias2, alloc292) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias2) model_decoder_layers_16_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[892] gv472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv472, R.dtype("float16")) _291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight2, alloc292, model_decoder_layers_16_fc1_bias2, alloc293) R.vm.kill_object(alloc292) R.vm.kill_object(model_decoder_layers_16_fc1_weight2) R.vm.kill_object(model_decoder_layers_16_fc1_bias2) model_decoder_layers_16_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[894] gv473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc294: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv473, R.dtype("float16")) _292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight2, alloc293, model_decoder_layers_16_fc2_bias2, alloc294) 
R.vm.kill_object(alloc293) R.vm.kill_object(model_decoder_layers_16_fc2_weight2) R.vm.kill_object(model_decoder_layers_16_fc2_bias2) gv474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc295: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv474, R.dtype("float16")) cls.add5(alloc291, alloc294, alloc295) R.vm.kill_object(alloc291) R.vm.kill_object(alloc294) model_decoder_layers_17_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[905] gv475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv475, R.dtype("float16")) cls.layer_norm2(alloc295, model_decoder_layers_17_self_attn_layer_norm_weight2, model_decoder_layers_17_self_attn_layer_norm_bias2, alloc296) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias2) model_decoder_layers_17_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[901] gv476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv476, R.dtype("float16")) _295: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight2, alloc296, model_decoder_layers_17_self_attn_q_proj_bias2, alloc297) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias2) gv477: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape557: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc297, gv477, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc297) model_decoder_layers_17_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv478, R.dtype("float16")) _296: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight2, alloc296, alloc298) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight2) gv479: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape558: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc298, gv479, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc298) model_decoder_layers_17_self_attn_v_proj_weight2: R.Tensor((1280, 
1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[899] gv480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc299: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv480, R.dtype("float16")) _297: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight2, alloc296, model_decoder_layers_17_self_attn_v_proj_bias2, alloc299) R.vm.kill_object(alloc296) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias2) gv481: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape559: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc299, gv481, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc299) gv482: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc300: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv482, R.dtype("float16")) cls.concatenate1(reshape557, reshape558, reshape559, alloc300) R.vm.kill_object(reshape557) R.vm.kill_object(reshape558) R.vm.kill_object(reshape559) gv483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape560: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc300, gv483, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc300) gv484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv484, R.dtype("float16")) _299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape560, alloc301) R.vm.kill_object(reshape560) gv485: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape561: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc301, gv485, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc301) gv486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape562: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape561, gv486, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape561) model_decoder_layers_17_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[903] gv487: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv487, R.dtype("float16")) _300: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight2, reshape562, model_decoder_layers_17_self_attn_out_proj_bias2, alloc302) R.vm.kill_object(reshape562) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias2) gv488: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv488, R.dtype("float16")) cls.add5(alloc295, alloc302, alloc303) R.vm.kill_object(alloc295) R.vm.kill_object(alloc302) model_decoder_layers_17_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[914] gv489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv489, R.dtype("float16")) cls.layer_norm2(alloc303, model_decoder_layers_17_encoder_attn_layer_norm_weight2, model_decoder_layers_17_encoder_attn_layer_norm_bias2, alloc304) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias2) model_decoder_layers_17_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[910] gv490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv490, R.dtype("float16")) _303: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight2, alloc304, model_decoder_layers_17_encoder_attn_q_proj_bias2, alloc305) R.vm.kill_object(alloc304) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias2) gv491: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape563: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc305, gv491, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc305) gv492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape564: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape563, gv492, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape563) gv493: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv493, R.dtype("float16")) _304: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape564, alloc306) R.vm.kill_object(reshape564) gv494: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape565: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc306, gv494, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc306) gv495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape566: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape565, gv495, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape565) model_decoder_layers_17_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[912] gv496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv496, R.dtype("float16")) _305: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight2, reshape566, model_decoder_layers_17_encoder_attn_out_proj_bias2, alloc307) R.vm.kill_object(reshape566) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias2) gv497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv497, R.dtype("float16")) cls.add5(alloc303, alloc307, alloc308) R.vm.kill_object(alloc303) R.vm.kill_object(alloc307) model_decoder_layers_17_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[920] gv498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv498, R.dtype("float16")) cls.layer_norm2(alloc308, model_decoder_layers_17_final_layer_norm_weight2, model_decoder_layers_17_final_layer_norm_bias2, alloc309) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias2) model_decoder_layers_17_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[916] gv499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv499, R.dtype("float16")) _308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight2, alloc309, model_decoder_layers_17_fc1_bias2, alloc310) R.vm.kill_object(alloc309) R.vm.kill_object(model_decoder_layers_17_fc1_weight2) R.vm.kill_object(model_decoder_layers_17_fc1_bias2) model_decoder_layers_17_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[918] gv500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv500, R.dtype("float16")) _309: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_17_fc2_weight2, alloc310, model_decoder_layers_17_fc2_bias2, alloc311) R.vm.kill_object(alloc310) R.vm.kill_object(model_decoder_layers_17_fc2_weight2) R.vm.kill_object(model_decoder_layers_17_fc2_bias2) gv501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc312: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv501, R.dtype("float16")) cls.add5(alloc308, alloc311, alloc312) R.vm.kill_object(alloc308) R.vm.kill_object(alloc311) model_decoder_layers_18_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[929] gv502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv502, R.dtype("float16")) cls.layer_norm2(alloc312, model_decoder_layers_18_self_attn_layer_norm_weight2, model_decoder_layers_18_self_attn_layer_norm_bias2, alloc313) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias2) model_decoder_layers_18_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[925] gv503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv503, R.dtype("float16")) _312: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight2, alloc313, model_decoder_layers_18_self_attn_q_proj_bias2, alloc314) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias2) gv504: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape567: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc314, gv504, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc314) model_decoder_layers_18_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv505: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv505, R.dtype("float16")) _313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight2, alloc313, alloc315) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight2) gv506: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape568: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc315, gv506, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc315) model_decoder_layers_18_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[923] gv507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv507, R.dtype("float16")) _314: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight2, alloc313, model_decoder_layers_18_self_attn_v_proj_bias2, alloc316) R.vm.kill_object(alloc313) 
R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias2) gv508: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape569: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc316, gv508, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc316) gv509: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc317: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv509, R.dtype("float16")) cls.concatenate1(reshape567, reshape568, reshape569, alloc317) R.vm.kill_object(reshape567) R.vm.kill_object(reshape568) R.vm.kill_object(reshape569) gv510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape570: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc317, gv510, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc317) gv511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv511, R.dtype("float16")) _316: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape570, alloc318) R.vm.kill_object(reshape570) gv512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape571: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc318, gv512, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc318) gv513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape572: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape571, gv513, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape571) model_decoder_layers_18_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[927] gv514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv514, R.dtype("float16")) _317: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight2, reshape572, model_decoder_layers_18_self_attn_out_proj_bias2, alloc319) R.vm.kill_object(reshape572) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias2) gv515: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv515, R.dtype("float16")) cls.add5(alloc312, alloc319, alloc320) R.vm.kill_object(alloc312) R.vm.kill_object(alloc319) model_decoder_layers_18_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[938] gv516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc321: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv516, R.dtype("float16")) cls.layer_norm2(alloc320, model_decoder_layers_18_encoder_attn_layer_norm_weight2, model_decoder_layers_18_encoder_attn_layer_norm_bias2, alloc321) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias2) model_decoder_layers_18_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[934] gv517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv517, R.dtype("float16")) _320: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight2, 
alloc321, model_decoder_layers_18_encoder_attn_q_proj_bias2, alloc322) R.vm.kill_object(alloc321) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias2) gv518: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape573: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc322, gv518, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc322) gv519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape574: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape573, gv519, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape573) gv520: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv520, R.dtype("float16")) _321: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape574, alloc323) R.vm.kill_object(reshape574) gv521: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape575: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc323, gv521, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc323) gv522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape576: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape575, gv522, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape575) model_decoder_layers_18_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[936] gv523: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv523, R.dtype("float16")) _322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight2, reshape576, model_decoder_layers_18_encoder_attn_out_proj_bias2, alloc324) R.vm.kill_object(reshape576) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias2) gv524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv524, R.dtype("float16")) cls.add5(alloc320, alloc324, alloc325) R.vm.kill_object(alloc320) R.vm.kill_object(alloc324) 
model_decoder_layers_18_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[944] gv525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv525, R.dtype("float16")) cls.layer_norm2(alloc325, model_decoder_layers_18_final_layer_norm_weight2, model_decoder_layers_18_final_layer_norm_bias2, alloc326) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias2) model_decoder_layers_18_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[940] gv526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv526, R.dtype("float16")) _325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight2, alloc326, model_decoder_layers_18_fc1_bias2, alloc327) R.vm.kill_object(alloc326) R.vm.kill_object(model_decoder_layers_18_fc1_weight2) R.vm.kill_object(model_decoder_layers_18_fc1_bias2) model_decoder_layers_18_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[942] gv527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc328: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv527, R.dtype("float16")) _326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight2, alloc327, model_decoder_layers_18_fc2_bias2, alloc328) R.vm.kill_object(alloc327) R.vm.kill_object(model_decoder_layers_18_fc2_weight2) R.vm.kill_object(model_decoder_layers_18_fc2_bias2) gv528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc329: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv528, R.dtype("float16")) cls.add5(alloc325, alloc328, alloc329) R.vm.kill_object(alloc325) R.vm.kill_object(alloc328) model_decoder_layers_19_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[953] gv529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv529, R.dtype("float16")) cls.layer_norm2(alloc329, model_decoder_layers_19_self_attn_layer_norm_weight2, model_decoder_layers_19_self_attn_layer_norm_bias2, alloc330) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias2) model_decoder_layers_19_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias2: 
R.Tensor((1280,), dtype="float16") = packed_params[949] gv530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv530, R.dtype("float16")) _329: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight2, alloc330, model_decoder_layers_19_self_attn_q_proj_bias2, alloc331) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias2) gv531: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape577: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc331, gv531, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc331) model_decoder_layers_19_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv532, R.dtype("float16")) _330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight2, alloc330, alloc332) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight2) gv533: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape578: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc332, gv533, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc332) model_decoder_layers_19_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[947] gv534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv534, R.dtype("float16")) _331: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight2, alloc330, model_decoder_layers_19_self_attn_v_proj_bias2, alloc333) R.vm.kill_object(alloc330) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias2) gv535: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape579: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc333, gv535, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc333) gv536: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc334: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv536, R.dtype("float16")) cls.concatenate1(reshape577, reshape578, reshape579, alloc334) R.vm.kill_object(reshape577) R.vm.kill_object(reshape578) R.vm.kill_object(reshape579) gv537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape580: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc334, gv537, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc334) gv538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv538, R.dtype("float16")) _333: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape580, alloc335) R.vm.kill_object(reshape580) gv539: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape581: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc335, gv539, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc335) gv540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape582: R.Tensor((1, 
seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape581, gv540, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape581) model_decoder_layers_19_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[951] gv541: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv541, R.dtype("float16")) _334: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight2, reshape582, model_decoder_layers_19_self_attn_out_proj_bias2, alloc336) R.vm.kill_object(reshape582) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias2) gv542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv542, R.dtype("float16")) cls.add5(alloc329, alloc336, alloc337) R.vm.kill_object(alloc329) R.vm.kill_object(alloc336) model_decoder_layers_19_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[962] gv543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv543, R.dtype("float16")) cls.layer_norm2(alloc337, model_decoder_layers_19_encoder_attn_layer_norm_weight2, model_decoder_layers_19_encoder_attn_layer_norm_bias2, alloc338) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias2) model_decoder_layers_19_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[958] gv544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv544, R.dtype("float16")) _337: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight2, alloc338, model_decoder_layers_19_encoder_attn_q_proj_bias2, alloc339) R.vm.kill_object(alloc338) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias2) gv545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape583: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc339, gv545, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc339) gv546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape584: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape583, gv546, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape583) gv547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc340: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv547, R.dtype("float16")) _338: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape584, alloc340) R.vm.kill_object(reshape584) gv548: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape585: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc340, gv548, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc340) gv549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape586: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape585, gv549, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape585) model_decoder_layers_19_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[960] gv550: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv550, R.dtype("float16")) _339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight2, reshape586, model_decoder_layers_19_encoder_attn_out_proj_bias2, alloc341) R.vm.kill_object(reshape586) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias2) gv551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv551, R.dtype("float16")) cls.add5(alloc337, alloc341, alloc342) R.vm.kill_object(alloc337) R.vm.kill_object(alloc341) model_decoder_layers_19_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[968] gv552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv552, R.dtype("float16")) cls.layer_norm2(alloc342, model_decoder_layers_19_final_layer_norm_weight2, model_decoder_layers_19_final_layer_norm_bias2, alloc343) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias2) 
model_decoder_layers_19_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[964] gv553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc344: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv553, R.dtype("float16")) _342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight2, alloc343, model_decoder_layers_19_fc1_bias2, alloc344) R.vm.kill_object(alloc343) R.vm.kill_object(model_decoder_layers_19_fc1_weight2) R.vm.kill_object(model_decoder_layers_19_fc1_bias2) model_decoder_layers_19_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[966] gv554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc345: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv554, R.dtype("float16")) _343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight2, alloc344, model_decoder_layers_19_fc2_bias2, alloc345) R.vm.kill_object(alloc344) R.vm.kill_object(model_decoder_layers_19_fc2_weight2) R.vm.kill_object(model_decoder_layers_19_fc2_bias2) gv555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc346: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage7, R.prim_value(0), gv555, R.dtype("float16")) cls.add5(alloc342, alloc345, alloc346) R.vm.kill_object(alloc342) R.vm.kill_object(alloc345) model_decoder_layers_20_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[977] gv556: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv556, R.dtype("float16")) cls.layer_norm2(alloc346, model_decoder_layers_20_self_attn_layer_norm_weight2, model_decoder_layers_20_self_attn_layer_norm_bias2, alloc347) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias2) model_decoder_layers_20_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[973] gv557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv557, R.dtype("float16")) _346: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight2, alloc347, model_decoder_layers_20_self_attn_q_proj_bias2, alloc348) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias2) gv558: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape587: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc348, gv558, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc348) model_decoder_layers_20_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv559: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv559, R.dtype("float16")) _347: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight2, alloc347, alloc349) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight2) gv560: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape588: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc349, gv560, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc349) model_decoder_layers_20_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[971] gv561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc350: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv561, R.dtype("float16")) _348: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight2, alloc347, model_decoder_layers_20_self_attn_v_proj_bias2, alloc350) R.vm.kill_object(alloc347) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias2) gv562: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape589: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc350, gv562, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc350) gv563: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc351: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv563, R.dtype("float16")) cls.concatenate1(reshape587, reshape588, reshape589, alloc351) R.vm.kill_object(reshape587) R.vm.kill_object(reshape588) R.vm.kill_object(reshape589) gv564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape590: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc351, gv564, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc351) gv565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv565, R.dtype("float16")) _350: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape590, alloc352) R.vm.kill_object(reshape590) gv566: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape591: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc352, gv566, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc352) gv567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape592: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape591, gv567, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape591) model_decoder_layers_20_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[975] gv568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv568, R.dtype("float16")) _351: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight2, reshape592, model_decoder_layers_20_self_attn_out_proj_bias2, alloc353) R.vm.kill_object(reshape592) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias2) gv569: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv569, R.dtype("float16")) cls.add5(alloc346, alloc353, alloc354) R.vm.kill_object(alloc346) R.vm.kill_object(alloc353) model_decoder_layers_20_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[986] gv570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv570, R.dtype("float16")) cls.layer_norm2(alloc354, model_decoder_layers_20_encoder_attn_layer_norm_weight2, model_decoder_layers_20_encoder_attn_layer_norm_bias2, alloc355) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias2) model_decoder_layers_20_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[982] gv571: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv571, R.dtype("float16")) _354: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight2, alloc355, model_decoder_layers_20_encoder_attn_q_proj_bias2, alloc356) R.vm.kill_object(alloc355) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias2) gv572: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape593: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc356, gv572, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc356) gv573: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape594: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape593, gv573, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape593) gv574: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv574, R.dtype("float16")) _355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), 
R.prim_value(T.float32(1)), reshape594, alloc357) R.vm.kill_object(reshape594) gv575: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape595: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc357, gv575, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc357) gv576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape596: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape595, gv576, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape595) model_decoder_layers_20_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[984] gv577: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv577, R.dtype("float16")) _356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight2, reshape596, model_decoder_layers_20_encoder_attn_out_proj_bias2, alloc358) R.vm.kill_object(reshape596) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias2) gv578: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc359: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv578, R.dtype("float16")) cls.add5(alloc354, alloc358, alloc359) R.vm.kill_object(alloc354) R.vm.kill_object(alloc358) model_decoder_layers_20_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[992] gv579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv579, R.dtype("float16")) cls.layer_norm2(alloc359, model_decoder_layers_20_final_layer_norm_weight2, model_decoder_layers_20_final_layer_norm_bias2, alloc360) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias2) model_decoder_layers_20_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[988] gv580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv580, R.dtype("float16")) _359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight2, alloc360, model_decoder_layers_20_fc1_bias2, alloc361) R.vm.kill_object(alloc360) 
R.vm.kill_object(model_decoder_layers_20_fc1_weight2) R.vm.kill_object(model_decoder_layers_20_fc1_bias2) model_decoder_layers_20_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[990] gv581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv581, R.dtype("float16")) _360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight2, alloc361, model_decoder_layers_20_fc2_bias2, alloc362) R.vm.kill_object(alloc361) R.vm.kill_object(model_decoder_layers_20_fc2_weight2) R.vm.kill_object(model_decoder_layers_20_fc2_bias2) gv582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc363: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv582, R.dtype("float16")) cls.add5(alloc359, alloc362, alloc363) R.vm.kill_object(alloc359) R.vm.kill_object(alloc362) model_decoder_layers_21_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv583: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv583, R.dtype("float16")) cls.layer_norm2(alloc363, 
model_decoder_layers_21_self_attn_layer_norm_weight2, model_decoder_layers_21_self_attn_layer_norm_bias2, alloc364) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias2) model_decoder_layers_21_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[997] gv584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv584, R.dtype("float16")) _363: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight2, alloc364, model_decoder_layers_21_self_attn_q_proj_bias2, alloc365) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias2) gv585: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape597: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc365, gv585, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc365) model_decoder_layers_21_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc366: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv586, R.dtype("float16")) _364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight2, alloc364, alloc366) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight2) gv587: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape598: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc366, gv587, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc366) model_decoder_layers_21_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[995] gv588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv588, R.dtype("float16")) _365: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight2, alloc364, model_decoder_layers_21_self_attn_v_proj_bias2, alloc367) R.vm.kill_object(alloc364) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias2) gv589: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape599: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc367, gv589, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc367) gv590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc368: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv590, R.dtype("float16")) cls.concatenate1(reshape597, reshape598, reshape599, alloc368) R.vm.kill_object(reshape597) R.vm.kill_object(reshape598) R.vm.kill_object(reshape599) gv591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape600: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc368, gv591, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc368) gv592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv592, R.dtype("float16")) _367: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape600, alloc369) R.vm.kill_object(reshape600) gv593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape601: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc369, gv593, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc369) gv594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape602: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape601, gv594, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape601) model_decoder_layers_21_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[999] gv595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv595, R.dtype("float16")) _368: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight2, reshape602, model_decoder_layers_21_self_attn_out_proj_bias2, alloc370) R.vm.kill_object(reshape602) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias2) gv596: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv596, R.dtype("float16")) cls.add5(alloc363, alloc370, alloc371) R.vm.kill_object(alloc363) R.vm.kill_object(alloc370) 
model_decoder_layers_21_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv597, R.dtype("float16")) cls.layer_norm2(alloc371, model_decoder_layers_21_encoder_attn_layer_norm_weight2, model_decoder_layers_21_encoder_attn_layer_norm_bias2, alloc372) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias2) model_decoder_layers_21_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv598, R.dtype("float16")) _371: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight2, alloc372, model_decoder_layers_21_encoder_attn_q_proj_bias2, alloc373) R.vm.kill_object(alloc372) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias2) gv599: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape603: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc373, gv599, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc373) gv600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape604: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape603, gv600, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape603) gv601: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv601, R.dtype("float16")) _372: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape604, alloc374) R.vm.kill_object(reshape604) gv602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape605: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc374, gv602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc374) gv603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape606: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape605, gv603, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape605) model_decoder_layers_21_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv604, R.dtype("float16")) _373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight2, reshape606, model_decoder_layers_21_encoder_attn_out_proj_bias2, alloc375) R.vm.kill_object(reshape606) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias2) gv605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv605, R.dtype("float16")) cls.add5(alloc371, alloc375, alloc376) R.vm.kill_object(alloc371) R.vm.kill_object(alloc375) model_decoder_layers_21_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv606, R.dtype("float16")) cls.layer_norm2(alloc376, model_decoder_layers_21_final_layer_norm_weight2, model_decoder_layers_21_final_layer_norm_bias2, alloc377) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias2) model_decoder_layers_21_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv607, R.dtype("float16")) _376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight2, alloc377, model_decoder_layers_21_fc1_bias2, alloc378) R.vm.kill_object(alloc377) R.vm.kill_object(model_decoder_layers_21_fc1_weight2) R.vm.kill_object(model_decoder_layers_21_fc1_bias2) model_decoder_layers_21_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv608, R.dtype("float16")) _377: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight2, alloc378, model_decoder_layers_21_fc2_bias2, 
alloc379) R.vm.kill_object(alloc378) R.vm.kill_object(model_decoder_layers_21_fc2_weight2) R.vm.kill_object(model_decoder_layers_21_fc2_bias2) gv609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc380: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv609, R.dtype("float16")) cls.add5(alloc376, alloc379, alloc380) R.vm.kill_object(alloc376) R.vm.kill_object(alloc379) model_decoder_layers_22_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv610, R.dtype("float16")) cls.layer_norm2(alloc380, model_decoder_layers_22_self_attn_layer_norm_weight2, model_decoder_layers_22_self_attn_layer_norm_bias2, alloc381) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias2) model_decoder_layers_22_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv611, R.dtype("float16")) 
_380: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight2, alloc381, model_decoder_layers_22_self_attn_q_proj_bias2, alloc382) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias2) gv612: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape607: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc382, gv612, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc382) model_decoder_layers_22_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv613, R.dtype("float16")) _381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight2, alloc381, alloc383) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight2) gv614: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape608: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc383, gv614, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc383) model_decoder_layers_22_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv615, R.dtype("float16")) _382: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight2, alloc381, model_decoder_layers_22_self_attn_v_proj_bias2, alloc384) R.vm.kill_object(alloc381) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias2) gv616: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape609: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc384, gv616, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc384) gv617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc385: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv617, R.dtype("float16")) cls.concatenate1(reshape607, reshape608, reshape609, alloc385) R.vm.kill_object(reshape607) R.vm.kill_object(reshape608) R.vm.kill_object(reshape609) gv618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape610: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc385, gv618, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc385) gv619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv619, R.dtype("float16")) _384: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape610, alloc386) R.vm.kill_object(reshape610) gv620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape611: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc386, gv620, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc386) gv621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape612: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape611, gv621, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape611) model_decoder_layers_22_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv622: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv622, R.dtype("float16")) _385: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight2, reshape612, model_decoder_layers_22_self_attn_out_proj_bias2, alloc387) R.vm.kill_object(reshape612) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias2) gv623: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv623, R.dtype("float16")) cls.add5(alloc380, alloc387, alloc388) R.vm.kill_object(alloc380) R.vm.kill_object(alloc387) model_decoder_layers_22_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc389: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv624, R.dtype("float16")) cls.layer_norm2(alloc388, model_decoder_layers_22_encoder_attn_layer_norm_weight2, model_decoder_layers_22_encoder_attn_layer_norm_bias2, alloc389) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias2) model_decoder_layers_22_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv625, R.dtype("float16")) _388: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight2, alloc389, model_decoder_layers_22_encoder_attn_q_proj_bias2, alloc390) R.vm.kill_object(alloc389) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias2) gv626: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape613: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc390, gv626, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc390) gv627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape614: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape613, gv627, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape613) gv628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv628, R.dtype("float16")) _389: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape614, alloc391) R.vm.kill_object(reshape614) gv629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape615: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc391, gv629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc391) gv630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape616: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape615, gv630, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape615) model_decoder_layers_22_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv631: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv631, R.dtype("float16")) _390: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight2, reshape616, model_decoder_layers_22_encoder_attn_out_proj_bias2, alloc392) R.vm.kill_object(reshape616) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias2) gv632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv632, R.dtype("float16")) cls.add5(alloc388, alloc392, alloc393) R.vm.kill_object(alloc388) R.vm.kill_object(alloc392) model_decoder_layers_22_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1039] model_decoder_layers_22_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv633, R.dtype("float16")) cls.layer_norm2(alloc393, model_decoder_layers_22_final_layer_norm_weight2, model_decoder_layers_22_final_layer_norm_bias2, alloc394) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias2) model_decoder_layers_22_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv634, R.dtype("float16")) _393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight2, alloc394, model_decoder_layers_22_fc1_bias2, alloc395) R.vm.kill_object(alloc394) R.vm.kill_object(model_decoder_layers_22_fc1_weight2) R.vm.kill_object(model_decoder_layers_22_fc1_bias2) model_decoder_layers_22_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv635, R.dtype("float16")) _394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight2, alloc395, model_decoder_layers_22_fc2_bias2, alloc396) R.vm.kill_object(alloc395) R.vm.kill_object(model_decoder_layers_22_fc2_weight2) R.vm.kill_object(model_decoder_layers_22_fc2_bias2) gv636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc397: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv636, R.dtype("float16")) cls.add5(alloc393, alloc396, alloc397) R.vm.kill_object(alloc393) R.vm.kill_object(alloc396) model_decoder_layers_23_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[1049] gv637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv637, R.dtype("float16")) cls.layer_norm2(alloc397, model_decoder_layers_23_self_attn_layer_norm_weight2, model_decoder_layers_23_self_attn_layer_norm_bias2, alloc398) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias2) model_decoder_layers_23_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv638, R.dtype("float16")) _397: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight2, alloc398, model_decoder_layers_23_self_attn_q_proj_bias2, alloc399) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias2) gv639: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape617: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc399, gv639, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc399) model_decoder_layers_23_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv640, R.dtype("float16")) _398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight2, alloc398, alloc400) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight2) gv641: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape618: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc400, gv641, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc400) model_decoder_layers_23_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc401: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv642, R.dtype("float16")) _399: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight2, alloc398, model_decoder_layers_23_self_attn_v_proj_bias2, alloc401) R.vm.kill_object(alloc398) 
R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias2) gv643: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape619: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc401, gv643, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc401) gv644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc402: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv644, R.dtype("float16")) cls.concatenate1(reshape617, reshape618, reshape619, alloc402) R.vm.kill_object(reshape617) R.vm.kill_object(reshape618) R.vm.kill_object(reshape619) gv645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape620: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc402, gv645, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc402) gv646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv646, R.dtype("float16")) _401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape620, alloc403) R.vm.kill_object(reshape620) gv647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape621: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc403, gv647, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc403) gv648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape622: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape621, gv648, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape621) model_decoder_layers_23_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv649, R.dtype("float16")) _402: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight2, reshape622, model_decoder_layers_23_self_attn_out_proj_bias2, alloc404) R.vm.kill_object(reshape622) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias2) gv650: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv650, R.dtype("float16")) cls.add5(alloc397, alloc404, alloc405) R.vm.kill_object(alloc397) R.vm.kill_object(alloc404) model_decoder_layers_23_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv651, R.dtype("float16")) cls.layer_norm2(alloc405, model_decoder_layers_23_encoder_attn_layer_norm_weight2, model_decoder_layers_23_encoder_attn_layer_norm_bias2, alloc406) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias2) model_decoder_layers_23_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc407: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv652, R.dtype("float16")) _405: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight2, 
alloc406, model_decoder_layers_23_encoder_attn_q_proj_bias2, alloc407) R.vm.kill_object(alloc406) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias2) gv653: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape623: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc407, gv653, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc407) gv654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape624: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape623, gv654, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape623) gv655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc408: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv655, R.dtype("float16")) _406: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape624, alloc408) R.vm.kill_object(reshape624) gv656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape625: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc408, gv656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc408) gv657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape626: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape625, gv657, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape625) model_decoder_layers_23_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1056] gv658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv658, R.dtype("float16")) _407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight2, reshape626, model_decoder_layers_23_encoder_attn_out_proj_bias2, alloc409) R.vm.kill_object(reshape626) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias2) gv659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv659, R.dtype("float16")) cls.add5(alloc405, alloc409, alloc410) R.vm.kill_object(alloc405) R.vm.kill_object(alloc409) 
model_decoder_layers_23_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv660, R.dtype("float16")) cls.layer_norm2(alloc410, model_decoder_layers_23_final_layer_norm_weight2, model_decoder_layers_23_final_layer_norm_bias2, alloc411) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias2) model_decoder_layers_23_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc412: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv661, R.dtype("float16")) _410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight2, alloc411, model_decoder_layers_23_fc1_bias2, alloc412) R.vm.kill_object(alloc411) R.vm.kill_object(model_decoder_layers_23_fc1_weight2) R.vm.kill_object(model_decoder_layers_23_fc1_bias2) model_decoder_layers_23_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv662, R.dtype("float16")) _411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_23_fc2_weight2, alloc412, model_decoder_layers_23_fc2_bias2, alloc413) R.vm.kill_object(alloc412) R.vm.kill_object(model_decoder_layers_23_fc2_weight2) R.vm.kill_object(model_decoder_layers_23_fc2_bias2) gv663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc414: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv663, R.dtype("float16")) cls.add5(alloc410, alloc413, alloc414) R.vm.kill_object(alloc410) R.vm.kill_object(alloc413) model_decoder_layers_24_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv664: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv664, R.dtype("float16")) cls.layer_norm2(alloc414, model_decoder_layers_24_self_attn_layer_norm_weight2, model_decoder_layers_24_self_attn_layer_norm_bias2, alloc415) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias2) model_decoder_layers_24_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] 
model_decoder_layers_24_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv665, R.dtype("float16")) _414: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight2, alloc415, model_decoder_layers_24_self_attn_q_proj_bias2, alloc416) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias2) gv666: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape627: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc416, gv666, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc416) model_decoder_layers_24_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv667, R.dtype("float16")) _415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight2, alloc415, alloc417) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight2) gv668: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape628: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc417, gv668, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc417) model_decoder_layers_24_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv669: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv669, R.dtype("float16")) _416: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight2, alloc415, model_decoder_layers_24_self_attn_v_proj_bias2, alloc418) R.vm.kill_object(alloc415) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias2) gv670: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape629: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc418, gv670, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc418) gv671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc419: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv671, R.dtype("float16")) cls.concatenate1(reshape627, reshape628, reshape629, alloc419) R.vm.kill_object(reshape627) R.vm.kill_object(reshape628) R.vm.kill_object(reshape629) gv672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape630: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc419, gv672, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc419) gv673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv673, R.dtype("float16")) _418: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape630, alloc420) R.vm.kill_object(reshape630) gv674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape631: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc420, gv674, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc420) gv675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape632: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape631, gv675, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape631) model_decoder_layers_24_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv676, R.dtype("float16")) _419: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight2, reshape632, model_decoder_layers_24_self_attn_out_proj_bias2, alloc421) R.vm.kill_object(reshape632) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias2) gv677: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc422: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv677, R.dtype("float16")) cls.add5(alloc414, alloc421, alloc422) R.vm.kill_object(alloc414) R.vm.kill_object(alloc421) model_decoder_layers_24_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv678, R.dtype("float16")) cls.layer_norm2(alloc422, model_decoder_layers_24_encoder_attn_layer_norm_weight2, model_decoder_layers_24_encoder_attn_layer_norm_bias2, alloc423) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias2) model_decoder_layers_24_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv679, R.dtype("float16")) _422: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight2, alloc423, model_decoder_layers_24_encoder_attn_q_proj_bias2, alloc424) R.vm.kill_object(alloc423) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias2) gv680: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape633: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc424, gv680, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc424) gv681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape634: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape633, gv681, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape633) gv682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv682, R.dtype("float16")) _423: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape634, alloc425) R.vm.kill_object(reshape634) gv683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape635: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc425, gv683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc425) gv684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape636: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape635, gv684, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape635) model_decoder_layers_24_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") 
= packed_params[1080] gv685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv685, R.dtype("float16")) _424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight2, reshape636, model_decoder_layers_24_encoder_attn_out_proj_bias2, alloc426) R.vm.kill_object(reshape636) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias2) gv686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc427: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv686, R.dtype("float16")) cls.add5(alloc422, alloc426, alloc427) R.vm.kill_object(alloc422) R.vm.kill_object(alloc426) model_decoder_layers_24_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv687, R.dtype("float16")) cls.layer_norm2(alloc427, model_decoder_layers_24_final_layer_norm_weight2, model_decoder_layers_24_final_layer_norm_bias2, alloc428) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias2) model_decoder_layers_24_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv688, R.dtype("float16")) _427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight2, alloc428, model_decoder_layers_24_fc1_bias2, alloc429) R.vm.kill_object(alloc428) R.vm.kill_object(model_decoder_layers_24_fc1_weight2) R.vm.kill_object(model_decoder_layers_24_fc1_bias2) model_decoder_layers_24_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc430: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv689, R.dtype("float16")) _428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight2, alloc429, model_decoder_layers_24_fc2_bias2, alloc430) R.vm.kill_object(alloc429) R.vm.kill_object(model_decoder_layers_24_fc2_weight2) R.vm.kill_object(model_decoder_layers_24_fc2_bias2) gv690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc431: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv690, R.dtype("float16")) cls.add5(alloc427, alloc430, alloc431) R.vm.kill_object(alloc427) R.vm.kill_object(alloc430) model_decoder_layers_25_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv691, R.dtype("float16")) cls.layer_norm2(alloc431, model_decoder_layers_25_self_attn_layer_norm_weight2, model_decoder_layers_25_self_attn_layer_norm_bias2, alloc432) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias2) model_decoder_layers_25_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv692, R.dtype("float16")) _431: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight2, alloc432, model_decoder_layers_25_self_attn_q_proj_bias2, alloc433) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias2) 
gv693: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape637: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc433, gv693, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc433) model_decoder_layers_25_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv694, R.dtype("float16")) _432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight2, alloc432, alloc434) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight2) gv695: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape638: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc434, gv695, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc434) model_decoder_layers_25_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv696, R.dtype("float16")) _433: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight2, alloc432, model_decoder_layers_25_self_attn_v_proj_bias2, alloc435) R.vm.kill_object(alloc432) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias2) gv697: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape639: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc435, gv697, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc435) gv698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc436: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv698, R.dtype("float16")) cls.concatenate1(reshape637, reshape638, reshape639, alloc436) R.vm.kill_object(reshape637) R.vm.kill_object(reshape638) R.vm.kill_object(reshape639) gv699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape640: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc436, gv699, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc436) gv700: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv700, R.dtype("float16")) _435: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape640, alloc437) R.vm.kill_object(reshape640) gv701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape641: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc437, gv701, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc437) gv702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape642: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape641, gv702, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape641) model_decoder_layers_25_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv703, R.dtype("float16")) _436: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight2, reshape642, model_decoder_layers_25_self_attn_out_proj_bias2, alloc438) R.vm.kill_object(reshape642) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias2) gv704: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv704, R.dtype("float16")) cls.add5(alloc431, alloc438, alloc439) R.vm.kill_object(alloc431) R.vm.kill_object(alloc438) model_decoder_layers_25_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc440: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv705, R.dtype("float16")) cls.layer_norm2(alloc439, model_decoder_layers_25_encoder_attn_layer_norm_weight2, model_decoder_layers_25_encoder_attn_layer_norm_bias2, alloc440) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias2) model_decoder_layers_25_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv706, R.dtype("float16")) _439: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight2, alloc440, model_decoder_layers_25_encoder_attn_q_proj_bias2, alloc441) R.vm.kill_object(alloc440) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias2) gv707: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape643: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc441, gv707, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc441) gv708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape644: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape643, gv708, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape643) gv709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv709, R.dtype("float16")) _440: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), 
R.prim_value(T.float32(1)), reshape644, alloc442) R.vm.kill_object(reshape644) gv710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape645: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc442, gv710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc442) gv711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape646: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape645, gv711, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape645) model_decoder_layers_25_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv712, R.dtype("float16")) _441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight2, reshape646, model_decoder_layers_25_encoder_attn_out_proj_bias2, alloc443) R.vm.kill_object(reshape646) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias2) gv713: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv713, R.dtype("float16")) cls.add5(alloc439, alloc443, alloc444) R.vm.kill_object(alloc439) R.vm.kill_object(alloc443) model_decoder_layers_25_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv714, R.dtype("float16")) cls.layer_norm2(alloc444, model_decoder_layers_25_final_layer_norm_weight2, model_decoder_layers_25_final_layer_norm_bias2, alloc445) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias2) model_decoder_layers_25_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv715, R.dtype("float16")) _444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight2, alloc445, model_decoder_layers_25_fc1_bias2, alloc446) R.vm.kill_object(alloc445) 
R.vm.kill_object(model_decoder_layers_25_fc1_weight2) R.vm.kill_object(model_decoder_layers_25_fc1_bias2) model_decoder_layers_25_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] model_decoder_layers_25_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv716, R.dtype("float16")) _445: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight2, alloc446, model_decoder_layers_25_fc2_bias2, alloc447) R.vm.kill_object(alloc446) R.vm.kill_object(model_decoder_layers_25_fc2_weight2) R.vm.kill_object(model_decoder_layers_25_fc2_bias2) gv717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc448: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv717, R.dtype("float16")) cls.add5(alloc444, alloc447, alloc448) R.vm.kill_object(alloc444) R.vm.kill_object(alloc447) model_decoder_layers_26_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv718, R.dtype("float16")) cls.layer_norm2(alloc448, 
model_decoder_layers_26_self_attn_layer_norm_weight2, model_decoder_layers_26_self_attn_layer_norm_bias2, alloc449) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias2) model_decoder_layers_26_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv719, R.dtype("float16")) _448: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight2, alloc449, model_decoder_layers_26_self_attn_q_proj_bias2, alloc450) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias2) gv720: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape647: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc450, gv720, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc450) model_decoder_layers_26_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc451: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv721, R.dtype("float16")) _449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_26_self_attn_k_proj_weight2, alloc449, alloc451) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight2) gv722: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape648: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc451, gv722, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc451) model_decoder_layers_26_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv723, R.dtype("float16")) _450: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight2, alloc449, model_decoder_layers_26_self_attn_v_proj_bias2, alloc452) R.vm.kill_object(alloc449) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias2) gv724: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape649: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc452, gv724, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc452) gv725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc453: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv725, R.dtype("float16")) cls.concatenate1(reshape647, reshape648, reshape649, alloc453) R.vm.kill_object(reshape647) R.vm.kill_object(reshape648) R.vm.kill_object(reshape649) gv726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape650: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc453, gv726, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc453) gv727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv727, R.dtype("float16")) _452: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape650, alloc454) R.vm.kill_object(reshape650) gv728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape651: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc454, gv728, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc454) gv729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape652: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape651, gv729, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape651) model_decoder_layers_26_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv730, R.dtype("float16")) _453: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight2, reshape652, model_decoder_layers_26_self_attn_out_proj_bias2, alloc455) R.vm.kill_object(reshape652) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias2) gv731: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv731, R.dtype("float16")) cls.add5(alloc448, alloc455, alloc456) R.vm.kill_object(alloc448) R.vm.kill_object(alloc455) 
model_decoder_layers_26_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc457: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv732, R.dtype("float16")) cls.layer_norm2(alloc456, model_decoder_layers_26_encoder_attn_layer_norm_weight2, model_decoder_layers_26_encoder_attn_layer_norm_bias2, alloc457) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias2) model_decoder_layers_26_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv733, R.dtype("float16")) _456: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight2, alloc457, model_decoder_layers_26_encoder_attn_q_proj_bias2, alloc458) R.vm.kill_object(alloc457) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias2) gv734: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape653: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc458, gv734, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc458) gv735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape654: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape653, gv735, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape653) gv736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv736, R.dtype("float16")) _457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape654, alloc459) R.vm.kill_object(reshape654) gv737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape655: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc459, gv737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc459) gv738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape656: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape655, gv738, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape655) model_decoder_layers_26_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv739: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv739, R.dtype("float16")) _458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight2, reshape656, model_decoder_layers_26_encoder_attn_out_proj_bias2, alloc460) R.vm.kill_object(reshape656) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias2) gv740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv740, R.dtype("float16")) cls.add5(alloc456, alloc460, alloc461) R.vm.kill_object(alloc456) R.vm.kill_object(alloc460) model_decoder_layers_26_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv741, R.dtype("float16")) cls.layer_norm2(alloc461, model_decoder_layers_26_final_layer_norm_weight2, model_decoder_layers_26_final_layer_norm_bias2, alloc462) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias2) model_decoder_layers_26_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv742, R.dtype("float16")) _461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight2, alloc462, model_decoder_layers_26_fc1_bias2, alloc463) R.vm.kill_object(alloc462) R.vm.kill_object(model_decoder_layers_26_fc1_weight2) R.vm.kill_object(model_decoder_layers_26_fc1_bias2) model_decoder_layers_26_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv743, R.dtype("float16")) _462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight2, alloc463, model_decoder_layers_26_fc2_bias2, 
alloc464) R.vm.kill_object(alloc463) R.vm.kill_object(model_decoder_layers_26_fc2_weight2) R.vm.kill_object(model_decoder_layers_26_fc2_bias2) gv744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc465: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv744, R.dtype("float16")) cls.add5(alloc461, alloc464, alloc465) R.vm.kill_object(alloc461) R.vm.kill_object(alloc464) model_decoder_layers_27_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv745, R.dtype("float16")) cls.layer_norm2(alloc465, model_decoder_layers_27_self_attn_layer_norm_weight2, model_decoder_layers_27_self_attn_layer_norm_bias2, alloc466) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias2) model_decoder_layers_27_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv746, R.dtype("float16")) 
_465: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight2, alloc466, model_decoder_layers_27_self_attn_q_proj_bias2, alloc467) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias2) gv747: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape657: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc467, gv747, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc467) model_decoder_layers_27_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv748, R.dtype("float16")) _466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight2, alloc466, alloc468) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight2) gv749: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape658: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc468, gv749, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc468) model_decoder_layers_27_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv750, R.dtype("float16")) _467: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight2, alloc466, model_decoder_layers_27_self_attn_v_proj_bias2, alloc469) R.vm.kill_object(alloc466) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias2) gv751: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape659: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc469, gv751, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc469) gv752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc470: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv752, R.dtype("float16")) cls.concatenate1(reshape657, reshape658, reshape659, alloc470) R.vm.kill_object(reshape657) R.vm.kill_object(reshape658) R.vm.kill_object(reshape659) gv753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape660: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc470, gv753, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc470) gv754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv754, R.dtype("float16")) _469: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape660, alloc471) R.vm.kill_object(reshape660) gv755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape661: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc471, gv755, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc471) gv756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape662: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape661, gv756, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape661) model_decoder_layers_27_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv757: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv757, R.dtype("float16")) _470: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight2, reshape662, model_decoder_layers_27_self_attn_out_proj_bias2, alloc472) R.vm.kill_object(reshape662) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias2) gv758: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv758, R.dtype("float16")) cls.add5(alloc465, alloc472, alloc473) R.vm.kill_object(alloc465) R.vm.kill_object(alloc472) model_decoder_layers_27_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv759, R.dtype("float16")) cls.layer_norm2(alloc473, model_decoder_layers_27_encoder_attn_layer_norm_weight2, model_decoder_layers_27_encoder_attn_layer_norm_bias2, alloc474) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias2) model_decoder_layers_27_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv760, R.dtype("float16")) _473: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight2, alloc474, model_decoder_layers_27_encoder_attn_q_proj_bias2, alloc475) R.vm.kill_object(alloc474) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias2) gv761: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape663: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc475, gv761, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc475) gv762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape664: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape663, gv762, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape663) gv763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc476: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv763, R.dtype("float16")) _474: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape664, alloc476) R.vm.kill_object(reshape664) gv764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape665: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc476, gv764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc476) gv765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape666: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape665, gv765, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape665) model_decoder_layers_27_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv766, R.dtype("float16")) _475: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight2, reshape666, model_decoder_layers_27_encoder_attn_out_proj_bias2, alloc477) R.vm.kill_object(reshape666) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias2) gv767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv767, R.dtype("float16")) cls.add5(alloc473, alloc477, alloc478) R.vm.kill_object(alloc473) R.vm.kill_object(alloc477) model_decoder_layers_27_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv768, R.dtype("float16")) cls.layer_norm2(alloc478, model_decoder_layers_27_final_layer_norm_weight2, model_decoder_layers_27_final_layer_norm_bias2, alloc479) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias2) model_decoder_layers_27_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1156] gv769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc480: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv769, R.dtype("float16")) _478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight2, alloc479, model_decoder_layers_27_fc1_bias2, alloc480) R.vm.kill_object(alloc479) R.vm.kill_object(model_decoder_layers_27_fc1_weight2) R.vm.kill_object(model_decoder_layers_27_fc1_bias2) model_decoder_layers_27_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv770, R.dtype("float16")) _479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight2, alloc480, model_decoder_layers_27_fc2_bias2, alloc481) R.vm.kill_object(alloc480) R.vm.kill_object(model_decoder_layers_27_fc2_weight2) R.vm.kill_object(model_decoder_layers_27_fc2_bias2) gv771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc482: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv771, R.dtype("float16")) cls.add5(alloc478, alloc481, alloc482) R.vm.kill_object(alloc478) R.vm.kill_object(alloc481) model_decoder_layers_28_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[1169] gv772: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv772, R.dtype("float16")) cls.layer_norm2(alloc482, model_decoder_layers_28_self_attn_layer_norm_weight2, model_decoder_layers_28_self_attn_layer_norm_bias2, alloc483) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias2) model_decoder_layers_28_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv773, R.dtype("float16")) _482: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight2, alloc483, model_decoder_layers_28_self_attn_q_proj_bias2, alloc484) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias2) gv774: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape667: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc484, gv774, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc484) model_decoder_layers_28_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv775, R.dtype("float16")) _483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight2, alloc483, alloc485) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight2) gv776: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape668: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc485, gv776, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc485) model_decoder_layers_28_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv777, R.dtype("float16")) _484: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight2, alloc483, model_decoder_layers_28_self_attn_v_proj_bias2, alloc486) R.vm.kill_object(alloc483) 
R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias2) gv778: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape669: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc486, gv778, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc486) gv779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc487: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv779, R.dtype("float16")) cls.concatenate1(reshape667, reshape668, reshape669, alloc487) R.vm.kill_object(reshape667) R.vm.kill_object(reshape668) R.vm.kill_object(reshape669) gv780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape670: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc487, gv780, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc487) gv781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv781, R.dtype("float16")) _486: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape670, alloc488) R.vm.kill_object(reshape670) gv782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape671: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc488, gv782, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc488) gv783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape672: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape671, gv783, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape671) model_decoder_layers_28_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv784, R.dtype("float16")) _487: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_out_proj_weight2, reshape672, model_decoder_layers_28_self_attn_out_proj_bias2, alloc489) R.vm.kill_object(reshape672) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias2) gv785: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv785, R.dtype("float16")) cls.add5(alloc482, alloc489, alloc490) R.vm.kill_object(alloc482) R.vm.kill_object(alloc489) model_decoder_layers_28_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv786, R.dtype("float16")) cls.layer_norm2(alloc490, model_decoder_layers_28_encoder_attn_layer_norm_weight2, model_decoder_layers_28_encoder_attn_layer_norm_bias2, alloc491) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias2) model_decoder_layers_28_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv787, R.dtype("float16")) _490: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight2, 
alloc491, model_decoder_layers_28_encoder_attn_q_proj_bias2, alloc492) R.vm.kill_object(alloc491) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias2) gv788: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape673: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc492, gv788, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc492) gv789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape674: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape673, gv789, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape673) gv790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv790, R.dtype("float16")) _491: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape674, alloc493) R.vm.kill_object(reshape674) gv791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape675: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc493, gv791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc493) gv792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape676: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape675, gv792, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape675) model_decoder_layers_28_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv793, R.dtype("float16")) _492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight2, reshape676, model_decoder_layers_28_encoder_attn_out_proj_bias2, alloc494) R.vm.kill_object(reshape676) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias2) gv794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc495: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv794, R.dtype("float16")) cls.add5(alloc490, alloc494, alloc495) R.vm.kill_object(alloc490) R.vm.kill_object(alloc494) 
model_decoder_layers_28_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv795, R.dtype("float16")) cls.layer_norm2(alloc495, model_decoder_layers_28_final_layer_norm_weight2, model_decoder_layers_28_final_layer_norm_bias2, alloc496) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias2) model_decoder_layers_28_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv796, R.dtype("float16")) _495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight2, alloc496, model_decoder_layers_28_fc1_bias2, alloc497) R.vm.kill_object(alloc496) R.vm.kill_object(model_decoder_layers_28_fc1_weight2) R.vm.kill_object(model_decoder_layers_28_fc1_bias2) model_decoder_layers_28_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv797, R.dtype("float16")) _496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight2, alloc497, model_decoder_layers_28_fc2_bias2, alloc498) R.vm.kill_object(alloc497) R.vm.kill_object(model_decoder_layers_28_fc2_weight2) R.vm.kill_object(model_decoder_layers_28_fc2_bias2) gv798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc499: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv798, R.dtype("float16")) cls.add5(alloc495, alloc498, alloc499) R.vm.kill_object(alloc495) R.vm.kill_object(alloc498) model_decoder_layers_29_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv799, R.dtype("float16")) cls.layer_norm2(alloc499, model_decoder_layers_29_self_attn_layer_norm_weight2, model_decoder_layers_29_self_attn_layer_norm_bias2, alloc500) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias2) model_decoder_layers_29_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] 
model_decoder_layers_29_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv800, R.dtype("float16")) _499: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight2, alloc500, model_decoder_layers_29_self_attn_q_proj_bias2, alloc501) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias2) gv801: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape677: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc501, gv801, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc501) model_decoder_layers_29_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv802, R.dtype("float16")) _500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight2, alloc500, alloc502) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight2) gv803: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape678: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc502, gv803, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc502) model_decoder_layers_29_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc503: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv804, R.dtype("float16")) _501: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight2, alloc500, model_decoder_layers_29_self_attn_v_proj_bias2, alloc503) R.vm.kill_object(alloc500) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias2) gv805: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape679: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc503, gv805, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc503) gv806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc504: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv806, R.dtype("float16")) cls.concatenate1(reshape677, reshape678, reshape679, alloc504) R.vm.kill_object(reshape677) R.vm.kill_object(reshape678) R.vm.kill_object(reshape679) gv807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape680: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc504, gv807, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc504) gv808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv808, R.dtype("float16")) _503: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape680, alloc505) R.vm.kill_object(reshape680) gv809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape681: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc505, gv809, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc505) gv810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape682: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape681, gv810, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape681) model_decoder_layers_29_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv811, R.dtype("float16")) _504: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight2, reshape682, model_decoder_layers_29_self_attn_out_proj_bias2, alloc506) R.vm.kill_object(reshape682) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias2) gv812: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv812, R.dtype("float16")) cls.add5(alloc499, alloc506, alloc507) R.vm.kill_object(alloc499) R.vm.kill_object(alloc506) model_decoder_layers_29_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv813, R.dtype("float16")) cls.layer_norm2(alloc507, model_decoder_layers_29_encoder_attn_layer_norm_weight2, model_decoder_layers_29_encoder_attn_layer_norm_bias2, alloc508) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias2) model_decoder_layers_29_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv814, R.dtype("float16")) _507: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight2, alloc508, model_decoder_layers_29_encoder_attn_q_proj_bias2, alloc509) R.vm.kill_object(alloc508) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias2) gv815: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape683: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc509, gv815, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc509) gv816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape684: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape683, gv816, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape683) gv817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv817, R.dtype("float16")) _508: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape684, alloc510) R.vm.kill_object(reshape684) gv818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape685: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc510, gv818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc510) gv819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape686: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape685, gv819, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape685) model_decoder_layers_29_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") 
= packed_params[1200] gv820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv820, R.dtype("float16")) _509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight2, reshape686, model_decoder_layers_29_encoder_attn_out_proj_bias2, alloc511) R.vm.kill_object(reshape686) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias2) gv821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv821, R.dtype("float16")) cls.add5(alloc507, alloc511, alloc512) R.vm.kill_object(alloc507) R.vm.kill_object(alloc511) model_decoder_layers_29_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv822, R.dtype("float16")) cls.layer_norm2(alloc512, model_decoder_layers_29_final_layer_norm_weight2, model_decoder_layers_29_final_layer_norm_bias2, alloc513) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias2) model_decoder_layers_29_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv823, R.dtype("float16")) _512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight2, alloc513, model_decoder_layers_29_fc1_bias2, alloc514) R.vm.kill_object(alloc513) R.vm.kill_object(model_decoder_layers_29_fc1_weight2) R.vm.kill_object(model_decoder_layers_29_fc1_bias2) model_decoder_layers_29_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc515: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv824, R.dtype("float16")) _513: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_29_fc2_weight2, alloc514, model_decoder_layers_29_fc2_bias2, alloc515) R.vm.kill_object(alloc514) R.vm.kill_object(model_decoder_layers_29_fc2_weight2) R.vm.kill_object(model_decoder_layers_29_fc2_bias2) gv825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc516: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv825, R.dtype("float16")) cls.add5(alloc512, alloc515, alloc516) R.vm.kill_object(alloc512) R.vm.kill_object(alloc515) model_decoder_layers_30_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv826, R.dtype("float16")) cls.layer_norm2(alloc516, model_decoder_layers_30_self_attn_layer_norm_weight2, model_decoder_layers_30_self_attn_layer_norm_bias2, alloc517) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias2) model_decoder_layers_30_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv827, R.dtype("float16")) _516: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight2, alloc517, model_decoder_layers_30_self_attn_q_proj_bias2, alloc518) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias2) 
gv828: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape687: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc518, gv828, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc518) model_decoder_layers_30_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv829, R.dtype("float16")) _517: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight2, alloc517, alloc519) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight2) gv830: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape688: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc519, gv830, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc519) model_decoder_layers_30_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv831, R.dtype("float16")) _518: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight2, alloc517, model_decoder_layers_30_self_attn_v_proj_bias2, alloc520) R.vm.kill_object(alloc517) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias2) gv832: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape689: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc520, gv832, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc520) gv833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc521: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv833, R.dtype("float16")) cls.concatenate1(reshape687, reshape688, reshape689, alloc521) R.vm.kill_object(reshape687) R.vm.kill_object(reshape688) R.vm.kill_object(reshape689) gv834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape690: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc521, gv834, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc521) gv835: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv835, R.dtype("float16")) _520: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape690, alloc522) R.vm.kill_object(reshape690) gv836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape691: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc522, gv836, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc522) gv837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape692: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape691, gv837, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape691) model_decoder_layers_30_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv838, R.dtype("float16")) _521: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight2, reshape692, model_decoder_layers_30_self_attn_out_proj_bias2, alloc523) R.vm.kill_object(reshape692) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias2) gv839: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv839, R.dtype("float16")) cls.add5(alloc516, alloc523, alloc524) R.vm.kill_object(alloc516) R.vm.kill_object(alloc523) model_decoder_layers_30_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc525: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv840, R.dtype("float16")) cls.layer_norm2(alloc524, model_decoder_layers_30_encoder_attn_layer_norm_weight2, model_decoder_layers_30_encoder_attn_layer_norm_bias2, alloc525) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias2) model_decoder_layers_30_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv841, R.dtype("float16")) _524: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight2, alloc525, model_decoder_layers_30_encoder_attn_q_proj_bias2, alloc526) R.vm.kill_object(alloc525) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias2) gv842: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape693: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc526, gv842, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc526) gv843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape694: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape693, gv843, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape693) gv844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv844, R.dtype("float16")) _525: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), 
R.prim_value(T.float32(1)), reshape694, alloc527) R.vm.kill_object(reshape694) gv845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape695: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc527, gv845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc527) gv846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape696: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape695, gv846, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape695) model_decoder_layers_30_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv847: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv847, R.dtype("float16")) _526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight2, reshape696, model_decoder_layers_30_encoder_attn_out_proj_bias2, alloc528) R.vm.kill_object(reshape696) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias2) gv848: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv848, R.dtype("float16")) cls.add5(alloc524, alloc528, alloc529) R.vm.kill_object(alloc524) R.vm.kill_object(alloc528) model_decoder_layers_30_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc530: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv849, R.dtype("float16")) cls.layer_norm2(alloc529, model_decoder_layers_30_final_layer_norm_weight2, model_decoder_layers_30_final_layer_norm_bias2, alloc530) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias2) model_decoder_layers_30_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc531: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv850, R.dtype("float16")) _529: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight2, alloc530, model_decoder_layers_30_fc1_bias2, alloc531) R.vm.kill_object(alloc530) 
R.vm.kill_object(model_decoder_layers_30_fc1_weight2) R.vm.kill_object(model_decoder_layers_30_fc1_bias2) model_decoder_layers_30_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc532: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv851, R.dtype("float16")) _530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight2, alloc531, model_decoder_layers_30_fc2_bias2, alloc532) R.vm.kill_object(alloc531) R.vm.kill_object(model_decoder_layers_30_fc2_weight2) R.vm.kill_object(model_decoder_layers_30_fc2_bias2) gv852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc533: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv852, R.dtype("float16")) cls.add5(alloc529, alloc532, alloc533) R.vm.kill_object(alloc529) R.vm.kill_object(alloc532) model_decoder_layers_31_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc534: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv853, R.dtype("float16")) cls.layer_norm2(alloc533, 
model_decoder_layers_31_self_attn_layer_norm_weight2, model_decoder_layers_31_self_attn_layer_norm_bias2, alloc534) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias2) model_decoder_layers_31_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc535: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv854, R.dtype("float16")) _533: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight2, alloc534, model_decoder_layers_31_self_attn_q_proj_bias2, alloc535) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias2) gv855: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape697: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc535, gv855, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc535) model_decoder_layers_31_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc536: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv856, R.dtype("float16")) _534: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight2, alloc534, alloc536) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight2) gv857: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape698: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc536, gv857, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc536) model_decoder_layers_31_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc537: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv858, R.dtype("float16")) _535: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight2, alloc534, model_decoder_layers_31_self_attn_v_proj_bias2, alloc537) R.vm.kill_object(alloc534) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias2) gv859: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape699: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc537, gv859, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc537) gv860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc538: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv860, R.dtype("float16")) cls.concatenate1(reshape697, reshape698, reshape699, alloc538) R.vm.kill_object(reshape697) R.vm.kill_object(reshape698) R.vm.kill_object(reshape699) gv861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape700: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc538, gv861, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc538) gv862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc539: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv862, R.dtype("float16")) _537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape700, alloc539) R.vm.kill_object(reshape700) gv863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape701: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc539, gv863, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc539) gv864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape702: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape701, gv864, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape701) model_decoder_layers_31_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc540: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv865, R.dtype("float16")) _538: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight2, reshape702, model_decoder_layers_31_self_attn_out_proj_bias2, alloc540) R.vm.kill_object(reshape702) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias2) gv866: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc541: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv866, R.dtype("float16")) cls.add5(alloc533, alloc540, alloc541) R.vm.kill_object(alloc533) R.vm.kill_object(alloc540) 
model_decoder_layers_31_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc542: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv867, R.dtype("float16")) cls.layer_norm2(alloc541, model_decoder_layers_31_encoder_attn_layer_norm_weight2, model_decoder_layers_31_encoder_attn_layer_norm_bias2, alloc542) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias2) model_decoder_layers_31_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc543: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv868, R.dtype("float16")) _541: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight2, alloc542, model_decoder_layers_31_encoder_attn_q_proj_bias2, alloc543) R.vm.kill_object(alloc542) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias2) gv869: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape703: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc543, gv869, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc543) gv870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape704: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape703, gv870, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape703) gv871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc544: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv871, R.dtype("float16")) _542: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape704, alloc544) R.vm.kill_object(reshape704) gv872: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape705: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc544, gv872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc544) gv873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape706: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape705, gv873, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape705) model_decoder_layers_31_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc545: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv874, R.dtype("float16")) _543: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight2, reshape706, model_decoder_layers_31_encoder_attn_out_proj_bias2, alloc545) R.vm.kill_object(reshape706) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias2) gv875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc546: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv875, R.dtype("float16")) R.vm.kill_object(storage6) cls.add5(alloc541, alloc545, alloc546) R.vm.kill_object(alloc541) R.vm.kill_object(alloc545) model_decoder_layers_31_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc547: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv876, R.dtype("float16")) cls.layer_norm2(alloc546, model_decoder_layers_31_final_layer_norm_weight2, model_decoder_layers_31_final_layer_norm_bias2, alloc547) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias2) model_decoder_layers_31_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1252] gv877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc548: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv877, R.dtype("float16")) R.vm.kill_object(storage4) _546: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_31_fc1_weight2, alloc547, model_decoder_layers_31_fc1_bias2, alloc548) R.vm.kill_object(alloc547) R.vm.kill_object(model_decoder_layers_31_fc1_weight2) R.vm.kill_object(model_decoder_layers_31_fc1_bias2) model_decoder_layers_31_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc549: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv878, R.dtype("float16")) R.vm.kill_object(storage5) _547: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_31_fc2_weight2, alloc548, model_decoder_layers_31_fc2_bias2, alloc549) R.vm.kill_object(alloc548) R.vm.kill_object(model_decoder_layers_31_fc2_weight2) R.vm.kill_object(model_decoder_layers_31_fc2_bias2) gv879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc550: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv879, R.dtype("float16")) R.vm.kill_object(storage7) cls.add5(alloc546, alloc549, alloc550) R.vm.kill_object(alloc546) R.vm.kill_object(alloc549) model_decoder_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc551: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv880, R.dtype("float16")) R.vm.kill_object(storage8) cls.layer_norm2(alloc550, model_decoder_layer_norm_weight2, model_decoder_layer_norm_bias2, alloc551) R.vm.kill_object(alloc550) R.vm.kill_object(model_decoder_layer_norm_weight2) R.vm.kill_object(model_decoder_layer_norm_bias2) storage9: R.Object = R.vm.alloc_storage(R.shape([20480]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc552: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage9, R.prim_value(0), gv881, R.dtype("float16")) R.vm.kill_object(storage9) cls.take2(alloc551, logit_positions, 
# NOTE(review): machine-generated TVM Relax VM dump (TVMScript printer output).
# The formatting in this file is mangled — several VM statements are fused onto
# each physical line — so only standalone comment lines are inserted here; the
# code lines themselves are untouched.
#
# The first line below finishes the preceding @R.function (its ErrorContext
# strings name it `batch_prefill`): it allocates a float32 output buffer,
# multiplies the gathered hidden states against
# `model_decoder_embed_tokens_weight2` via the fused cuBLAS kernel, checks the
# result against (1, batch_size, 51866) with `vm.builtin.match_shape`, and
# returns it as the logits tensor.
#
# It then opens `create_tir_paged_kv_cache(max_batch_size_, max_total_seq_len_,
# prefill_chunk_size_, page_size_, support_sliding_window_) -> R.Object`:
# every parameter is a 1-D R.Shape carrying a single symbolic int, and a
# 5-slot shape heap is allocated to receive those ints.
alloc552) R.vm.kill_object(alloc551) storage10: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) alloc553: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage10, R.prim_value(0), gv882, R.dtype("float32")) R.vm.kill_object(storage10) _551: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul5_cublas", model_decoder_embed_tokens_weight2, alloc552, alloc553) R.vm.kill_object(model_decoder_embed_tokens_weight2) R.vm.kill_object(alloc552) R.call_packed("vm.builtin.match_shape", alloc553, shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_prefill, loc=return, annotation=R.Tensor((1, batch_size, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) return alloc553 @R.function def create_tir_paged_kv_cache(max_batch_size_: R.Shape(["max_batch_size"]), max_total_seq_len_: R.Shape(["max_total_seq_len"]), prefill_chunk_size_: R.Shape(["prefill_chunk_size"]), page_size_: R.Shape(["page_size"]), support_sliding_window_: R.Shape(["support_sliding_window"])) -> R.Object: max_batch_size = T.int64() max_total_seq_len = T.int64() prefill_chunk_size = T.int64() page_size = T.int64() support_sliding_window = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_shape_info", max_batch_size_, R.prim_value(1),
# Parameter validation: for each of the five shape parameters, a
# `vm.builtin.check_shape_info` call asserts it is a 1-D shape, and a
# `vm.builtin.match_shape` call copies its single dimension into slots 0..4 of
# `shape_heap` (allocated above with capacity 5).  Each call carries an
# ErrorContext string used verbatim in the runtime error message.
R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", max_total_seq_len_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", prefill_chunk_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", page_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", support_sliding_window_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", max_batch_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", max_total_seq_len_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", prefill_chunk_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", page_size_,
# After validation, `vm.builtin.make_shape` packs the five heap slots into one
# 5-D shape, and `vm.builtin.paged_attention_kv_cache_create_reduced` builds
# the cache object from that shape plus the TIR kernels compiled into this
# module (transpose-append, paged/ragged prefill and decode, sliding-window
# variants, merge-state, fused RoPE, page copy, debug get-kv, compact copy,
# tree attention).  The literal args 32 / 20 / 20 / 64 presumably encode
# num-layers / num-qo-heads / num-kv-heads / head-dim — consistent with the
# (…, 20, 64) reshapes and 1280 = 20*64 hidden size elsewhere in this dump —
# TODO confirm against the TVM PagedKVCache constructor signature.
# The tail of this line already opens the next @R.function, `decode`, whose
# `packed_params` tuple annotation continues beyond this span.
shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", support_sliding_window_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,)) gv2559: R.Shape(ndim=5) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(5), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(2), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=5),)) paged_kv_cache: R.Object = R.call_packed("vm.builtin.paged_attention_kv_cache_create_reduced", gv2559, R.prim_value(32), R.prim_value(20), R.prim_value(20), R.prim_value(64), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.const(0, "float16"), cls.tir_kv_cache_transpose_append, cls.batch_prefill_paged_kv, cls.batch_decode_paged_kv, cls.batch_prefill_paged_kv_sliding_window, cls.batch_decode_paged_kv_sliding_window, cls.batch_prefill_ragged_kv, cls.merge_state_inplace, cls.fused_rope, cls.copy_single_page, cls.tir_kv_cache_debug_get_kv, cls.compact_kv_copy, cls.batch_tree_attn, sinfo_args=(R.Object,)) return paged_kv_cache @R.function def decode(input_ids: R.Tensor((1, 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"),
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(1),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) 
R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight5: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] reshape1353: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, R.shape([1]), sinfo_args=(R.Tensor((1,), dtype="int32"),)) model_decoder_embed_tokens_weight5_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage19: R.Object = R.vm.alloc_storage(R.shape([10240]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1167: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) cls.take3(model_decoder_embed_tokens_weight5_1, reshape1353, alloc1167) 
R.vm.kill_object(reshape1353) R.vm.kill_object(model_decoder_embed_tokens_weight5_1) lv264: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((1,), dtype="int32"),)) model_decoder_embed_positions_weight5: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage20: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1168: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) cls.take4(model_decoder_embed_positions_weight5, lv264, alloc1168) R.vm.kill_object(lv264) R.vm.kill_object(model_decoder_embed_positions_weight5) storage21: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1169: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_reshape20_reshape20_add6(alloc1167, alloc1168, alloc1169) R.vm.kill_object(alloc1167) R.vm.kill_object(alloc1168) model_decoder_layers_0_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[497] alloc1170: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1169, model_decoder_layers_0_self_attn_layer_norm_weight5, model_decoder_layers_0_self_attn_layer_norm_bias5, alloc1170) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias5) model_decoder_layers_0_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[493] alloc1171: R.Tensor((1, 1, 
1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_q_proj_weight5, model_decoder_layers_0_self_attn_q_proj_bias5, alloc1171) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias5) model_decoder_layers_0_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage22: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1172: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1170, model_decoder_layers_0_self_attn_k_proj_weight5, alloc1172) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight5) model_decoder_layers_0_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[491] storage23: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1173: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_v_proj_weight5, model_decoder_layers_0_self_attn_v_proj_bias5, alloc1173) R.vm.kill_object(alloc1170) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias5) alloc1174: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1171, alloc1172, alloc1173, alloc1174) R.vm.kill_object(alloc1171) R.vm.kill_object(alloc1172) R.vm.kill_object(alloc1173) 
alloc1175: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1173: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), alloc1174, alloc1175) R.vm.kill_object(alloc1174) lv44: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1175, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1175) model_decoder_layers_0_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[495] alloc1176: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv44, model_decoder_layers_0_self_attn_out_proj_weight5, model_decoder_layers_0_self_attn_out_proj_bias5, alloc1169, alloc1176) R.vm.kill_object(alloc1169) R.vm.kill_object(lv44) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias5) model_decoder_layers_0_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[506] alloc1177: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1176, model_decoder_layers_0_encoder_attn_layer_norm_weight5, model_decoder_layers_0_encoder_attn_layer_norm_bias5, alloc1177) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias5) model_decoder_layers_0_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[502] alloc1178: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1177, model_decoder_layers_0_encoder_attn_q_proj_weight5, model_decoder_layers_0_encoder_attn_q_proj_bias5, alloc1178) R.vm.kill_object(alloc1177) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias5) lv47: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1178, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1178) alloc1179: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1177: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), lv47, alloc1179) R.vm.kill_object(lv47) lv48: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1179, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1179) model_decoder_layers_0_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[504] alloc1180: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv48, model_decoder_layers_0_encoder_attn_out_proj_weight5, model_decoder_layers_0_encoder_attn_out_proj_bias5, alloc1176, alloc1180) R.vm.kill_object(alloc1176) R.vm.kill_object(lv48) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias5) model_decoder_layers_0_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[512] alloc1181: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1180, model_decoder_layers_0_final_layer_norm_weight5, model_decoder_layers_0_final_layer_norm_bias5, alloc1181) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias5) model_decoder_layers_0_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[508] alloc1182: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1181, model_decoder_layers_0_fc1_weight5, model_decoder_layers_0_fc1_bias5, alloc1182) R.vm.kill_object(alloc1181) R.vm.kill_object(model_decoder_layers_0_fc1_weight5) R.vm.kill_object(model_decoder_layers_0_fc1_bias5) model_decoder_layers_0_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[510] alloc1183: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1182, model_decoder_layers_0_fc2_weight5, model_decoder_layers_0_fc2_bias5, alloc1180, alloc1183) R.vm.kill_object(alloc1180) R.vm.kill_object(alloc1182) R.vm.kill_object(model_decoder_layers_0_fc2_weight5) R.vm.kill_object(model_decoder_layers_0_fc2_bias5) model_decoder_layers_1_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[520] 
model_decoder_layers_1_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[521] alloc1184: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1183, model_decoder_layers_1_self_attn_layer_norm_weight5, model_decoder_layers_1_self_attn_layer_norm_bias5, alloc1184) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias5) model_decoder_layers_1_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[517] alloc1185: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_q_proj_weight5, model_decoder_layers_1_self_attn_q_proj_bias5, alloc1185) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias5) model_decoder_layers_1_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] alloc1186: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1184, model_decoder_layers_1_self_attn_k_proj_weight5, alloc1186) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight5) model_decoder_layers_1_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[515] alloc1187: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_v_proj_weight5, 
model_decoder_layers_1_self_attn_v_proj_bias5, alloc1187) R.vm.kill_object(alloc1184) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias5) alloc1188: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1185, alloc1186, alloc1187, alloc1188) R.vm.kill_object(alloc1185) R.vm.kill_object(alloc1186) R.vm.kill_object(alloc1187) alloc1189: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1187: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), alloc1188, alloc1189) R.vm.kill_object(alloc1188) lv55: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1189, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1189) model_decoder_layers_1_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[519] alloc1190: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv55, model_decoder_layers_1_self_attn_out_proj_weight5, model_decoder_layers_1_self_attn_out_proj_bias5, alloc1183, alloc1190) R.vm.kill_object(alloc1183) R.vm.kill_object(lv55) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias5) model_decoder_layers_1_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[530] alloc1191: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1190, model_decoder_layers_1_encoder_attn_layer_norm_weight5, model_decoder_layers_1_encoder_attn_layer_norm_bias5, alloc1191) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias5) model_decoder_layers_1_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[526] alloc1192: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1191, model_decoder_layers_1_encoder_attn_q_proj_weight5, model_decoder_layers_1_encoder_attn_q_proj_bias5, alloc1192) R.vm.kill_object(alloc1191) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias5) lv58: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1192, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1192) alloc1193: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1191: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), lv58, alloc1193) R.vm.kill_object(lv58) lv59: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1193, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1193) model_decoder_layers_1_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] 
model_decoder_layers_1_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[528] alloc1194: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv59, model_decoder_layers_1_encoder_attn_out_proj_weight5, model_decoder_layers_1_encoder_attn_out_proj_bias5, alloc1190, alloc1194) R.vm.kill_object(alloc1190) R.vm.kill_object(lv59) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias5) model_decoder_layers_1_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[536] alloc1195: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1194, model_decoder_layers_1_final_layer_norm_weight5, model_decoder_layers_1_final_layer_norm_bias5, alloc1195) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias5) model_decoder_layers_1_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[532] alloc1196: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1195, model_decoder_layers_1_fc1_weight5, model_decoder_layers_1_fc1_bias5, alloc1196) R.vm.kill_object(alloc1195) R.vm.kill_object(model_decoder_layers_1_fc1_weight5) R.vm.kill_object(model_decoder_layers_1_fc1_bias5) model_decoder_layers_1_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[534] 
alloc1197: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1196, model_decoder_layers_1_fc2_weight5, model_decoder_layers_1_fc2_bias5, alloc1194, alloc1197) R.vm.kill_object(alloc1194) R.vm.kill_object(alloc1196) R.vm.kill_object(model_decoder_layers_1_fc2_weight5) R.vm.kill_object(model_decoder_layers_1_fc2_bias5) model_decoder_layers_2_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[545] alloc1198: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1197, model_decoder_layers_2_self_attn_layer_norm_weight5, model_decoder_layers_2_self_attn_layer_norm_bias5, alloc1198) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias5) model_decoder_layers_2_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[541] alloc1199: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_q_proj_weight5, model_decoder_layers_2_self_attn_q_proj_bias5, alloc1199) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias5) model_decoder_layers_2_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] alloc1200: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1198, 
model_decoder_layers_2_self_attn_k_proj_weight5, alloc1200) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight5) model_decoder_layers_2_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[539] alloc1201: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_v_proj_weight5, model_decoder_layers_2_self_attn_v_proj_bias5, alloc1201) R.vm.kill_object(alloc1198) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias5) alloc1202: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1199, alloc1200, alloc1201, alloc1202) R.vm.kill_object(alloc1199) R.vm.kill_object(alloc1200) R.vm.kill_object(alloc1201) alloc1203: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1201: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), alloc1202, alloc1203) R.vm.kill_object(alloc1202) lv66: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1203, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1203) model_decoder_layers_2_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[543] alloc1204: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv66, model_decoder_layers_2_self_attn_out_proj_weight5, model_decoder_layers_2_self_attn_out_proj_bias5, alloc1197, alloc1204) R.vm.kill_object(alloc1197) R.vm.kill_object(lv66) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias5) model_decoder_layers_2_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[554] alloc1205: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1204, model_decoder_layers_2_encoder_attn_layer_norm_weight5, model_decoder_layers_2_encoder_attn_layer_norm_bias5, alloc1205) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias5) model_decoder_layers_2_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[550] alloc1206: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1205, model_decoder_layers_2_encoder_attn_q_proj_weight5, model_decoder_layers_2_encoder_attn_q_proj_bias5, alloc1206) R.vm.kill_object(alloc1205) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias5) lv69: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1206, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1206) alloc1207: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) _1205: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), lv69, alloc1207) R.vm.kill_object(lv69) lv70: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1207, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1207) model_decoder_layers_2_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[552] alloc1208: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv70, model_decoder_layers_2_encoder_attn_out_proj_weight5, model_decoder_layers_2_encoder_attn_out_proj_bias5, alloc1204, alloc1208) R.vm.kill_object(alloc1204) R.vm.kill_object(lv70) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias5) model_decoder_layers_2_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[560] alloc1209: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1208, model_decoder_layers_2_final_layer_norm_weight5, model_decoder_layers_2_final_layer_norm_bias5, alloc1209) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias5) model_decoder_layers_2_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[556] alloc1210: R.Tensor((1, 1, 5120), dtype="float16") = 
R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1209, model_decoder_layers_2_fc1_weight5, model_decoder_layers_2_fc1_bias5, alloc1210) R.vm.kill_object(alloc1209) R.vm.kill_object(model_decoder_layers_2_fc1_weight5) R.vm.kill_object(model_decoder_layers_2_fc1_bias5) model_decoder_layers_2_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[558] alloc1211: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1210, model_decoder_layers_2_fc2_weight5, model_decoder_layers_2_fc2_bias5, alloc1208, alloc1211) R.vm.kill_object(alloc1208) R.vm.kill_object(alloc1210) R.vm.kill_object(model_decoder_layers_2_fc2_weight5) R.vm.kill_object(model_decoder_layers_2_fc2_bias5) model_decoder_layers_3_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[569] alloc1212: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1211, model_decoder_layers_3_self_attn_layer_norm_weight5, model_decoder_layers_3_self_attn_layer_norm_bias5, alloc1212) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias5) model_decoder_layers_3_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[565] alloc1213: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1212, 
model_decoder_layers_3_self_attn_q_proj_weight5, model_decoder_layers_3_self_attn_q_proj_bias5, alloc1213) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias5) model_decoder_layers_3_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] alloc1214: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1212, model_decoder_layers_3_self_attn_k_proj_weight5, alloc1214) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight5) model_decoder_layers_3_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[563] alloc1215: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1212, model_decoder_layers_3_self_attn_v_proj_weight5, model_decoder_layers_3_self_attn_v_proj_bias5, alloc1215) R.vm.kill_object(alloc1212) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias5) alloc1216: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1213, alloc1214, alloc1215, alloc1216) R.vm.kill_object(alloc1213) R.vm.kill_object(alloc1214) R.vm.kill_object(alloc1215) alloc1217: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), alloc1216, alloc1217) R.vm.kill_object(alloc1216) lv77: R.Tensor((1, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1217, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1217) model_decoder_layers_3_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[567] alloc1218: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv77, model_decoder_layers_3_self_attn_out_proj_weight5, model_decoder_layers_3_self_attn_out_proj_bias5, alloc1211, alloc1218) R.vm.kill_object(alloc1211) R.vm.kill_object(lv77) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias5) model_decoder_layers_3_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[578] alloc1219: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1218, model_decoder_layers_3_encoder_attn_layer_norm_weight5, model_decoder_layers_3_encoder_attn_layer_norm_bias5, alloc1219) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias5) model_decoder_layers_3_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[574] alloc1220: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1219, model_decoder_layers_3_encoder_attn_q_proj_weight5, 
model_decoder_layers_3_encoder_attn_q_proj_bias5, alloc1220) R.vm.kill_object(alloc1219) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias5) lv80: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1220, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1220) alloc1221: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), lv80, alloc1221) R.vm.kill_object(lv80) lv81: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1221, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1221) model_decoder_layers_3_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[576] alloc1222: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv81, model_decoder_layers_3_encoder_attn_out_proj_weight5, model_decoder_layers_3_encoder_attn_out_proj_bias5, alloc1218, alloc1222) R.vm.kill_object(alloc1218) R.vm.kill_object(lv81) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias5) model_decoder_layers_3_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[584] alloc1223: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.layer_norm3(alloc1222, model_decoder_layers_3_final_layer_norm_weight5, model_decoder_layers_3_final_layer_norm_bias5, alloc1223) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias5) model_decoder_layers_3_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[580] alloc1224: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1223, model_decoder_layers_3_fc1_weight5, model_decoder_layers_3_fc1_bias5, alloc1224) R.vm.kill_object(alloc1223) R.vm.kill_object(model_decoder_layers_3_fc1_weight5) R.vm.kill_object(model_decoder_layers_3_fc1_bias5) model_decoder_layers_3_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[582] alloc1225: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1224, model_decoder_layers_3_fc2_weight5, model_decoder_layers_3_fc2_bias5, alloc1222, alloc1225) R.vm.kill_object(alloc1222) R.vm.kill_object(alloc1224) R.vm.kill_object(model_decoder_layers_3_fc2_weight5) R.vm.kill_object(model_decoder_layers_3_fc2_bias5) model_decoder_layers_4_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[593] alloc1226: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1225, model_decoder_layers_4_self_attn_layer_norm_weight5, model_decoder_layers_4_self_attn_layer_norm_bias5, alloc1226) 
R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias5) model_decoder_layers_4_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[589] alloc1227: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_q_proj_weight5, model_decoder_layers_4_self_attn_q_proj_bias5, alloc1227) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias5) model_decoder_layers_4_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] alloc1228: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1226, model_decoder_layers_4_self_attn_k_proj_weight5, alloc1228) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight5) model_decoder_layers_4_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[587] alloc1229: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_v_proj_weight5, model_decoder_layers_4_self_attn_v_proj_bias5, alloc1229) R.vm.kill_object(alloc1226) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias5) alloc1230: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1227, alloc1228, alloc1229, alloc1230) R.vm.kill_object(alloc1227) R.vm.kill_object(alloc1228) R.vm.kill_object(alloc1229) alloc1231: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1229: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), alloc1230, alloc1231) R.vm.kill_object(alloc1230) lv88: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1231, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1231) model_decoder_layers_4_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[591] alloc1232: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv88, model_decoder_layers_4_self_attn_out_proj_weight5, model_decoder_layers_4_self_attn_out_proj_bias5, alloc1225, alloc1232) R.vm.kill_object(alloc1225) R.vm.kill_object(lv88) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias5) model_decoder_layers_4_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[602] alloc1233: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1232, model_decoder_layers_4_encoder_attn_layer_norm_weight5, model_decoder_layers_4_encoder_attn_layer_norm_bias5, alloc1233) 
R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias5) model_decoder_layers_4_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[598] alloc1234: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1233, model_decoder_layers_4_encoder_attn_q_proj_weight5, model_decoder_layers_4_encoder_attn_q_proj_bias5, alloc1234) R.vm.kill_object(alloc1233) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias5) lv91: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1234, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1234) alloc1235: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1233: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), lv91, alloc1235) R.vm.kill_object(lv91) lv92: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1235, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1235) model_decoder_layers_4_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[600] alloc1236: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv92, model_decoder_layers_4_encoder_attn_out_proj_weight5, 
model_decoder_layers_4_encoder_attn_out_proj_bias5, alloc1232, alloc1236) R.vm.kill_object(alloc1232) R.vm.kill_object(lv92) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias5) model_decoder_layers_4_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[608] alloc1237: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1236, model_decoder_layers_4_final_layer_norm_weight5, model_decoder_layers_4_final_layer_norm_bias5, alloc1237) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias5) model_decoder_layers_4_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[604] alloc1238: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1237, model_decoder_layers_4_fc1_weight5, model_decoder_layers_4_fc1_bias5, alloc1238) R.vm.kill_object(alloc1237) R.vm.kill_object(model_decoder_layers_4_fc1_weight5) R.vm.kill_object(model_decoder_layers_4_fc1_bias5) model_decoder_layers_4_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[606] alloc1239: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1238, model_decoder_layers_4_fc2_weight5, model_decoder_layers_4_fc2_bias5, alloc1236, alloc1239) R.vm.kill_object(alloc1236) R.vm.kill_object(alloc1238) 
R.vm.kill_object(model_decoder_layers_4_fc2_weight5) R.vm.kill_object(model_decoder_layers_4_fc2_bias5) model_decoder_layers_5_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[617] alloc1240: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1239, model_decoder_layers_5_self_attn_layer_norm_weight5, model_decoder_layers_5_self_attn_layer_norm_bias5, alloc1240) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias5) model_decoder_layers_5_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[613] alloc1241: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_q_proj_weight5, model_decoder_layers_5_self_attn_q_proj_bias5, alloc1241) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias5) model_decoder_layers_5_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] alloc1242: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1240, model_decoder_layers_5_self_attn_k_proj_weight5, alloc1242) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight5) model_decoder_layers_5_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[611] alloc1243: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_v_proj_weight5, model_decoder_layers_5_self_attn_v_proj_bias5, alloc1243) R.vm.kill_object(alloc1240) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias5) alloc1244: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1241, alloc1242, alloc1243, alloc1244) R.vm.kill_object(alloc1241) R.vm.kill_object(alloc1242) R.vm.kill_object(alloc1243) alloc1245: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1243: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), alloc1244, alloc1245) R.vm.kill_object(alloc1244) lv99: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1245, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1245) model_decoder_layers_5_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[615] alloc1246: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv99, model_decoder_layers_5_self_attn_out_proj_weight5, model_decoder_layers_5_self_attn_out_proj_bias5, alloc1239, alloc1246) R.vm.kill_object(alloc1239) R.vm.kill_object(lv99) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias5) model_decoder_layers_5_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[626] alloc1247: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1246, model_decoder_layers_5_encoder_attn_layer_norm_weight5, model_decoder_layers_5_encoder_attn_layer_norm_bias5, alloc1247) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias5) model_decoder_layers_5_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[622] alloc1248: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1247, model_decoder_layers_5_encoder_attn_q_proj_weight5, model_decoder_layers_5_encoder_attn_q_proj_bias5, alloc1248) R.vm.kill_object(alloc1247) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias5) lv102: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1248, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1248) alloc1249: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1247: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), lv102, alloc1249) R.vm.kill_object(lv102) lv103: R.Tensor((1, 1, 1280), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1249, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1249) model_decoder_layers_5_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[624] alloc1250: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv103, model_decoder_layers_5_encoder_attn_out_proj_weight5, model_decoder_layers_5_encoder_attn_out_proj_bias5, alloc1246, alloc1250) R.vm.kill_object(alloc1246) R.vm.kill_object(lv103) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias5) model_decoder_layers_5_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[632] alloc1251: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1250, model_decoder_layers_5_final_layer_norm_weight5, model_decoder_layers_5_final_layer_norm_bias5, alloc1251) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias5) model_decoder_layers_5_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[628] alloc1252: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1251, model_decoder_layers_5_fc1_weight5, model_decoder_layers_5_fc1_bias5, alloc1252) R.vm.kill_object(alloc1251) 
R.vm.kill_object(model_decoder_layers_5_fc1_weight5) R.vm.kill_object(model_decoder_layers_5_fc1_bias5) model_decoder_layers_5_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[630] alloc1253: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1252, model_decoder_layers_5_fc2_weight5, model_decoder_layers_5_fc2_bias5, alloc1250, alloc1253) R.vm.kill_object(alloc1250) R.vm.kill_object(alloc1252) R.vm.kill_object(model_decoder_layers_5_fc2_weight5) R.vm.kill_object(model_decoder_layers_5_fc2_bias5) model_decoder_layers_6_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[641] alloc1254: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1253, model_decoder_layers_6_self_attn_layer_norm_weight5, model_decoder_layers_6_self_attn_layer_norm_bias5, alloc1254) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias5) model_decoder_layers_6_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[637] alloc1255: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_q_proj_weight5, model_decoder_layers_6_self_attn_q_proj_bias5, alloc1255) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias5) 
model_decoder_layers_6_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] alloc1256: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1254, model_decoder_layers_6_self_attn_k_proj_weight5, alloc1256) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight5) model_decoder_layers_6_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[635] alloc1257: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_v_proj_weight5, model_decoder_layers_6_self_attn_v_proj_bias5, alloc1257) R.vm.kill_object(alloc1254) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias5) alloc1258: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1255, alloc1256, alloc1257, alloc1258) R.vm.kill_object(alloc1255) R.vm.kill_object(alloc1256) R.vm.kill_object(alloc1257) alloc1259: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1257: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), alloc1258, alloc1259) R.vm.kill_object(alloc1258) lv110: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1259, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1259) model_decoder_layers_6_self_attn_out_proj_weight5: R.Tensor((1280, 
1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[639] alloc1260: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv110, model_decoder_layers_6_self_attn_out_proj_weight5, model_decoder_layers_6_self_attn_out_proj_bias5, alloc1253, alloc1260) R.vm.kill_object(alloc1253) R.vm.kill_object(lv110) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias5) model_decoder_layers_6_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[650] alloc1261: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1260, model_decoder_layers_6_encoder_attn_layer_norm_weight5, model_decoder_layers_6_encoder_attn_layer_norm_bias5, alloc1261) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias5) model_decoder_layers_6_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[646] alloc1262: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1261, model_decoder_layers_6_encoder_attn_q_proj_weight5, model_decoder_layers_6_encoder_attn_q_proj_bias5, alloc1262) R.vm.kill_object(alloc1261) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias5) lv113: R.Tensor((1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1262, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1262) alloc1263: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), lv113, alloc1263) R.vm.kill_object(lv113) lv114: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1263, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1263) model_decoder_layers_6_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[648] alloc1264: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv114, model_decoder_layers_6_encoder_attn_out_proj_weight5, model_decoder_layers_6_encoder_attn_out_proj_bias5, alloc1260, alloc1264) R.vm.kill_object(alloc1260) R.vm.kill_object(lv114) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias5) model_decoder_layers_6_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[656] alloc1265: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1264, model_decoder_layers_6_final_layer_norm_weight5, model_decoder_layers_6_final_layer_norm_bias5, alloc1265) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight5) 
R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias5) model_decoder_layers_6_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[652] alloc1266: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1265, model_decoder_layers_6_fc1_weight5, model_decoder_layers_6_fc1_bias5, alloc1266) R.vm.kill_object(alloc1265) R.vm.kill_object(model_decoder_layers_6_fc1_weight5) R.vm.kill_object(model_decoder_layers_6_fc1_bias5) model_decoder_layers_6_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[654] alloc1267: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1266, model_decoder_layers_6_fc2_weight5, model_decoder_layers_6_fc2_bias5, alloc1264, alloc1267) R.vm.kill_object(alloc1264) R.vm.kill_object(alloc1266) R.vm.kill_object(model_decoder_layers_6_fc2_weight5) R.vm.kill_object(model_decoder_layers_6_fc2_bias5) model_decoder_layers_7_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[665] alloc1268: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1267, model_decoder_layers_7_self_attn_layer_norm_weight5, model_decoder_layers_7_self_attn_layer_norm_bias5, alloc1268) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias5) model_decoder_layers_7_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") 
= packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[661] alloc1269: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1268, model_decoder_layers_7_self_attn_q_proj_weight5, model_decoder_layers_7_self_attn_q_proj_bias5, alloc1269) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias5) model_decoder_layers_7_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] alloc1270: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1268, model_decoder_layers_7_self_attn_k_proj_weight5, alloc1270) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight5) model_decoder_layers_7_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[659] alloc1271: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1268, model_decoder_layers_7_self_attn_v_proj_weight5, model_decoder_layers_7_self_attn_v_proj_bias5, alloc1271) R.vm.kill_object(alloc1268) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias5) alloc1272: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1269, alloc1270, alloc1271, alloc1272) R.vm.kill_object(alloc1269) R.vm.kill_object(alloc1270) R.vm.kill_object(alloc1271) alloc1273: R.Tensor((1, 20, 64), dtype="float16") = 
R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1271: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), alloc1272, alloc1273) R.vm.kill_object(alloc1272) lv121: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1273, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1273) model_decoder_layers_7_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[663] alloc1274: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv121, model_decoder_layers_7_self_attn_out_proj_weight5, model_decoder_layers_7_self_attn_out_proj_bias5, alloc1267, alloc1274) R.vm.kill_object(alloc1267) R.vm.kill_object(lv121) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias5) model_decoder_layers_7_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[674] alloc1275: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1274, model_decoder_layers_7_encoder_attn_layer_norm_weight5, model_decoder_layers_7_encoder_attn_layer_norm_bias5, alloc1275) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias5) model_decoder_layers_7_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] 
model_decoder_layers_7_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[670] alloc1276: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1275, model_decoder_layers_7_encoder_attn_q_proj_weight5, model_decoder_layers_7_encoder_attn_q_proj_bias5, alloc1276) R.vm.kill_object(alloc1275) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias5) lv124: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1276, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1276) alloc1277: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1275: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), lv124, alloc1277) R.vm.kill_object(lv124) lv125: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1277, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1277) model_decoder_layers_7_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[672] alloc1278: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv125, model_decoder_layers_7_encoder_attn_out_proj_weight5, model_decoder_layers_7_encoder_attn_out_proj_bias5, alloc1274, alloc1278) R.vm.kill_object(alloc1274) R.vm.kill_object(lv125) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias5) model_decoder_layers_7_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[680] alloc1279: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1278, model_decoder_layers_7_final_layer_norm_weight5, model_decoder_layers_7_final_layer_norm_bias5, alloc1279) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias5) model_decoder_layers_7_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[676] alloc1280: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1279, model_decoder_layers_7_fc1_weight5, model_decoder_layers_7_fc1_bias5, alloc1280) R.vm.kill_object(alloc1279) R.vm.kill_object(model_decoder_layers_7_fc1_weight5) R.vm.kill_object(model_decoder_layers_7_fc1_bias5) model_decoder_layers_7_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[678] alloc1281: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1280, model_decoder_layers_7_fc2_weight5, model_decoder_layers_7_fc2_bias5, alloc1278, alloc1281) R.vm.kill_object(alloc1278) R.vm.kill_object(alloc1280) R.vm.kill_object(model_decoder_layers_7_fc2_weight5) R.vm.kill_object(model_decoder_layers_7_fc2_bias5) model_decoder_layers_8_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[688] 
model_decoder_layers_8_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[689] alloc1282: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1281, model_decoder_layers_8_self_attn_layer_norm_weight5, model_decoder_layers_8_self_attn_layer_norm_bias5, alloc1282) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias5) model_decoder_layers_8_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[685] alloc1283: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_q_proj_weight5, model_decoder_layers_8_self_attn_q_proj_bias5, alloc1283) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias5) model_decoder_layers_8_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] alloc1284: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1282, model_decoder_layers_8_self_attn_k_proj_weight5, alloc1284) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight5) model_decoder_layers_8_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[683] alloc1285: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_v_proj_weight5, 
model_decoder_layers_8_self_attn_v_proj_bias5, alloc1285) R.vm.kill_object(alloc1282) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias5) alloc1286: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1283, alloc1284, alloc1285, alloc1286) R.vm.kill_object(alloc1283) R.vm.kill_object(alloc1284) R.vm.kill_object(alloc1285) alloc1287: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1285: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), alloc1286, alloc1287) R.vm.kill_object(alloc1286) lv132: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1287, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1287) model_decoder_layers_8_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[687] alloc1288: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv132, model_decoder_layers_8_self_attn_out_proj_weight5, model_decoder_layers_8_self_attn_out_proj_bias5, alloc1281, alloc1288) R.vm.kill_object(alloc1281) R.vm.kill_object(lv132) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias5) model_decoder_layers_8_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[698] alloc1289: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1288, model_decoder_layers_8_encoder_attn_layer_norm_weight5, model_decoder_layers_8_encoder_attn_layer_norm_bias5, alloc1289) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias5) model_decoder_layers_8_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[694] alloc1290: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1289, model_decoder_layers_8_encoder_attn_q_proj_weight5, model_decoder_layers_8_encoder_attn_q_proj_bias5, alloc1290) R.vm.kill_object(alloc1289) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias5) lv135: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1290, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1290) alloc1291: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1289: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), lv135, alloc1291) R.vm.kill_object(lv135) lv136: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1291, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1291) model_decoder_layers_8_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] 
model_decoder_layers_8_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[696] alloc1292: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv136, model_decoder_layers_8_encoder_attn_out_proj_weight5, model_decoder_layers_8_encoder_attn_out_proj_bias5, alloc1288, alloc1292) R.vm.kill_object(alloc1288) R.vm.kill_object(lv136) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias5) model_decoder_layers_8_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[704] alloc1293: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1292, model_decoder_layers_8_final_layer_norm_weight5, model_decoder_layers_8_final_layer_norm_bias5, alloc1293) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias5) model_decoder_layers_8_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[700] alloc1294: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1293, model_decoder_layers_8_fc1_weight5, model_decoder_layers_8_fc1_bias5, alloc1294) R.vm.kill_object(alloc1293) R.vm.kill_object(model_decoder_layers_8_fc1_weight5) R.vm.kill_object(model_decoder_layers_8_fc1_bias5) model_decoder_layers_8_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[702] 
alloc1295: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1294, model_decoder_layers_8_fc2_weight5, model_decoder_layers_8_fc2_bias5, alloc1292, alloc1295) R.vm.kill_object(alloc1292) R.vm.kill_object(alloc1294) R.vm.kill_object(model_decoder_layers_8_fc2_weight5) R.vm.kill_object(model_decoder_layers_8_fc2_bias5) model_decoder_layers_9_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[713] alloc1296: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1295, model_decoder_layers_9_self_attn_layer_norm_weight5, model_decoder_layers_9_self_attn_layer_norm_bias5, alloc1296) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias5) model_decoder_layers_9_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[709] alloc1297: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_q_proj_weight5, model_decoder_layers_9_self_attn_q_proj_bias5, alloc1297) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias5) model_decoder_layers_9_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] alloc1298: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1296, 
model_decoder_layers_9_self_attn_k_proj_weight5, alloc1298) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight5) model_decoder_layers_9_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[707] alloc1299: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_v_proj_weight5, model_decoder_layers_9_self_attn_v_proj_bias5, alloc1299) R.vm.kill_object(alloc1296) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias5) alloc1300: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1297, alloc1298, alloc1299, alloc1300) R.vm.kill_object(alloc1297) R.vm.kill_object(alloc1298) R.vm.kill_object(alloc1299) alloc1301: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), alloc1300, alloc1301) R.vm.kill_object(alloc1300) lv143: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1301, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1301) model_decoder_layers_9_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[711] alloc1302: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv143, model_decoder_layers_9_self_attn_out_proj_weight5, model_decoder_layers_9_self_attn_out_proj_bias5, alloc1295, alloc1302) R.vm.kill_object(alloc1295) R.vm.kill_object(lv143) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias5) model_decoder_layers_9_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[722] alloc1303: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1302, model_decoder_layers_9_encoder_attn_layer_norm_weight5, model_decoder_layers_9_encoder_attn_layer_norm_bias5, alloc1303) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias5) model_decoder_layers_9_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[718] alloc1304: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1303, model_decoder_layers_9_encoder_attn_q_proj_weight5, model_decoder_layers_9_encoder_attn_q_proj_bias5, alloc1304) R.vm.kill_object(alloc1303) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias5) lv146: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1304, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1304) alloc1305: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) _1303: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), lv146, alloc1305) R.vm.kill_object(lv146) lv147: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1305, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1305) model_decoder_layers_9_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[720] alloc1306: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv147, model_decoder_layers_9_encoder_attn_out_proj_weight5, model_decoder_layers_9_encoder_attn_out_proj_bias5, alloc1302, alloc1306) R.vm.kill_object(alloc1302) R.vm.kill_object(lv147) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias5) model_decoder_layers_9_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[728] alloc1307: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1306, model_decoder_layers_9_final_layer_norm_weight5, model_decoder_layers_9_final_layer_norm_bias5, alloc1307) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias5) model_decoder_layers_9_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[724] alloc1308: R.Tensor((1, 1, 5120), 
dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1307, model_decoder_layers_9_fc1_weight5, model_decoder_layers_9_fc1_bias5, alloc1308) R.vm.kill_object(alloc1307) R.vm.kill_object(model_decoder_layers_9_fc1_weight5) R.vm.kill_object(model_decoder_layers_9_fc1_bias5) model_decoder_layers_9_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[726] alloc1309: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1308, model_decoder_layers_9_fc2_weight5, model_decoder_layers_9_fc2_bias5, alloc1306, alloc1309) R.vm.kill_object(alloc1306) R.vm.kill_object(alloc1308) R.vm.kill_object(model_decoder_layers_9_fc2_weight5) R.vm.kill_object(model_decoder_layers_9_fc2_bias5) model_decoder_layers_10_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[737] alloc1310: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1309, model_decoder_layers_10_self_attn_layer_norm_weight5, model_decoder_layers_10_self_attn_layer_norm_bias5, alloc1310) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias5) model_decoder_layers_10_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[733] alloc1311: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_q_proj_weight5, model_decoder_layers_10_self_attn_q_proj_bias5, alloc1311) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias5) model_decoder_layers_10_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] alloc1312: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1310, model_decoder_layers_10_self_attn_k_proj_weight5, alloc1312) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight5) model_decoder_layers_10_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[731] alloc1313: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_v_proj_weight5, model_decoder_layers_10_self_attn_v_proj_bias5, alloc1313) R.vm.kill_object(alloc1310) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias5) alloc1314: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1311, alloc1312, alloc1313, alloc1314) R.vm.kill_object(alloc1311) R.vm.kill_object(alloc1312) R.vm.kill_object(alloc1313) alloc1315: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1313: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), alloc1314, alloc1315) 
R.vm.kill_object(alloc1314) lv154: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1315, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1315) model_decoder_layers_10_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[735] alloc1316: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv154, model_decoder_layers_10_self_attn_out_proj_weight5, model_decoder_layers_10_self_attn_out_proj_bias5, alloc1309, alloc1316) R.vm.kill_object(alloc1309) R.vm.kill_object(lv154) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias5) model_decoder_layers_10_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[746] alloc1317: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1316, model_decoder_layers_10_encoder_attn_layer_norm_weight5, model_decoder_layers_10_encoder_attn_layer_norm_bias5, alloc1317) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias5) model_decoder_layers_10_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[742] alloc1318: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.fused_NT_matmul_add7(alloc1317, model_decoder_layers_10_encoder_attn_q_proj_weight5, model_decoder_layers_10_encoder_attn_q_proj_bias5, alloc1318) R.vm.kill_object(alloc1317) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias5) lv157: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1318, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1318) alloc1319: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1317: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), lv157, alloc1319) R.vm.kill_object(lv157) lv158: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1319, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1319) model_decoder_layers_10_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[744] alloc1320: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv158, model_decoder_layers_10_encoder_attn_out_proj_weight5, model_decoder_layers_10_encoder_attn_out_proj_bias5, alloc1316, alloc1320) R.vm.kill_object(alloc1316) R.vm.kill_object(lv158) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias5) model_decoder_layers_10_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[752] alloc1321: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1320, model_decoder_layers_10_final_layer_norm_weight5, model_decoder_layers_10_final_layer_norm_bias5, alloc1321) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias5) model_decoder_layers_10_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[748] alloc1322: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1321, model_decoder_layers_10_fc1_weight5, model_decoder_layers_10_fc1_bias5, alloc1322) R.vm.kill_object(alloc1321) R.vm.kill_object(model_decoder_layers_10_fc1_weight5) R.vm.kill_object(model_decoder_layers_10_fc1_bias5) model_decoder_layers_10_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[750] alloc1323: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1322, model_decoder_layers_10_fc2_weight5, model_decoder_layers_10_fc2_bias5, alloc1320, alloc1323) R.vm.kill_object(alloc1320) R.vm.kill_object(alloc1322) R.vm.kill_object(model_decoder_layers_10_fc2_weight5) R.vm.kill_object(model_decoder_layers_10_fc2_bias5) model_decoder_layers_11_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[761] alloc1324: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1323, model_decoder_layers_11_self_attn_layer_norm_weight5, model_decoder_layers_11_self_attn_layer_norm_bias5, alloc1324) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias5) model_decoder_layers_11_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[757] alloc1325: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_q_proj_weight5, model_decoder_layers_11_self_attn_q_proj_bias5, alloc1325) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias5) model_decoder_layers_11_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] alloc1326: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1324, model_decoder_layers_11_self_attn_k_proj_weight5, alloc1326) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight5) model_decoder_layers_11_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[755] alloc1327: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_v_proj_weight5, model_decoder_layers_11_self_attn_v_proj_bias5, alloc1327) R.vm.kill_object(alloc1324) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias5) alloc1328: 
R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1325, alloc1326, alloc1327, alloc1328) R.vm.kill_object(alloc1325) R.vm.kill_object(alloc1326) R.vm.kill_object(alloc1327) alloc1329: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1327: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), alloc1328, alloc1329) R.vm.kill_object(alloc1328) lv165: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1329, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1329) model_decoder_layers_11_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[759] alloc1330: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv165, model_decoder_layers_11_self_attn_out_proj_weight5, model_decoder_layers_11_self_attn_out_proj_bias5, alloc1323, alloc1330) R.vm.kill_object(alloc1323) R.vm.kill_object(lv165) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias5) model_decoder_layers_11_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[770] alloc1331: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1330, 
model_decoder_layers_11_encoder_attn_layer_norm_weight5, model_decoder_layers_11_encoder_attn_layer_norm_bias5, alloc1331) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias5) model_decoder_layers_11_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[766] alloc1332: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1331, model_decoder_layers_11_encoder_attn_q_proj_weight5, model_decoder_layers_11_encoder_attn_q_proj_bias5, alloc1332) R.vm.kill_object(alloc1331) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias5) lv168: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1332, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1332) alloc1333: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1331: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), lv168, alloc1333) R.vm.kill_object(lv168) lv169: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1333, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1333) model_decoder_layers_11_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[768] alloc1334: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), 
R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv169, model_decoder_layers_11_encoder_attn_out_proj_weight5, model_decoder_layers_11_encoder_attn_out_proj_bias5, alloc1330, alloc1334) R.vm.kill_object(alloc1330) R.vm.kill_object(lv169) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias5) model_decoder_layers_11_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[776] alloc1335: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1334, model_decoder_layers_11_final_layer_norm_weight5, model_decoder_layers_11_final_layer_norm_bias5, alloc1335) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias5) model_decoder_layers_11_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[772] alloc1336: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1335, model_decoder_layers_11_fc1_weight5, model_decoder_layers_11_fc1_bias5, alloc1336) R.vm.kill_object(alloc1335) R.vm.kill_object(model_decoder_layers_11_fc1_weight5) R.vm.kill_object(model_decoder_layers_11_fc1_bias5) model_decoder_layers_11_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[774] alloc1337: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1336, 
model_decoder_layers_11_fc2_weight5, model_decoder_layers_11_fc2_bias5, alloc1334, alloc1337) R.vm.kill_object(alloc1334) R.vm.kill_object(alloc1336) R.vm.kill_object(model_decoder_layers_11_fc2_weight5) R.vm.kill_object(model_decoder_layers_11_fc2_bias5) model_decoder_layers_12_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[785] alloc1338: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1337, model_decoder_layers_12_self_attn_layer_norm_weight5, model_decoder_layers_12_self_attn_layer_norm_bias5, alloc1338) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias5) model_decoder_layers_12_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[781] alloc1339: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_q_proj_weight5, model_decoder_layers_12_self_attn_q_proj_bias5, alloc1339) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias5) model_decoder_layers_12_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] alloc1340: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1338, model_decoder_layers_12_self_attn_k_proj_weight5, alloc1340) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight5) model_decoder_layers_12_self_attn_v_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[779] alloc1341: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_v_proj_weight5, model_decoder_layers_12_self_attn_v_proj_bias5, alloc1341) R.vm.kill_object(alloc1338) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias5) alloc1342: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1339, alloc1340, alloc1341, alloc1342) R.vm.kill_object(alloc1339) R.vm.kill_object(alloc1340) R.vm.kill_object(alloc1341) alloc1343: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1341: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), alloc1342, alloc1343) R.vm.kill_object(alloc1342) lv176: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1343, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1343) model_decoder_layers_12_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[783] alloc1344: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv176, model_decoder_layers_12_self_attn_out_proj_weight5, model_decoder_layers_12_self_attn_out_proj_bias5, alloc1337, 
alloc1344) R.vm.kill_object(alloc1337) R.vm.kill_object(lv176) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias5) model_decoder_layers_12_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[794] alloc1345: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1344, model_decoder_layers_12_encoder_attn_layer_norm_weight5, model_decoder_layers_12_encoder_attn_layer_norm_bias5, alloc1345) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias5) model_decoder_layers_12_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[790] alloc1346: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1345, model_decoder_layers_12_encoder_attn_q_proj_weight5, model_decoder_layers_12_encoder_attn_q_proj_bias5, alloc1346) R.vm.kill_object(alloc1345) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias5) lv179: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1346, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1346) alloc1347: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1345: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), 
R.prim_value(T.float32(1)), lv179, alloc1347) R.vm.kill_object(lv179) lv180: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1347, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1347) model_decoder_layers_12_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[792] alloc1348: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv180, model_decoder_layers_12_encoder_attn_out_proj_weight5, model_decoder_layers_12_encoder_attn_out_proj_bias5, alloc1344, alloc1348) R.vm.kill_object(alloc1344) R.vm.kill_object(lv180) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias5) model_decoder_layers_12_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[800] alloc1349: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1348, model_decoder_layers_12_final_layer_norm_weight5, model_decoder_layers_12_final_layer_norm_bias5, alloc1349) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias5) model_decoder_layers_12_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[796] alloc1350: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1349, 
model_decoder_layers_12_fc1_weight5, model_decoder_layers_12_fc1_bias5, alloc1350) R.vm.kill_object(alloc1349) R.vm.kill_object(model_decoder_layers_12_fc1_weight5) R.vm.kill_object(model_decoder_layers_12_fc1_bias5) model_decoder_layers_12_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[798] alloc1351: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1350, model_decoder_layers_12_fc2_weight5, model_decoder_layers_12_fc2_bias5, alloc1348, alloc1351) R.vm.kill_object(alloc1348) R.vm.kill_object(alloc1350) R.vm.kill_object(model_decoder_layers_12_fc2_weight5) R.vm.kill_object(model_decoder_layers_12_fc2_bias5) model_decoder_layers_13_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[809] alloc1352: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1351, model_decoder_layers_13_self_attn_layer_norm_weight5, model_decoder_layers_13_self_attn_layer_norm_bias5, alloc1352) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias5) model_decoder_layers_13_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[805] alloc1353: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_q_proj_weight5, model_decoder_layers_13_self_attn_q_proj_bias5, alloc1353) 
R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias5) model_decoder_layers_13_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] alloc1354: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1352, model_decoder_layers_13_self_attn_k_proj_weight5, alloc1354) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight5) model_decoder_layers_13_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[803] alloc1355: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_v_proj_weight5, model_decoder_layers_13_self_attn_v_proj_bias5, alloc1355) R.vm.kill_object(alloc1352) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias5) alloc1356: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1353, alloc1354, alloc1355, alloc1356) R.vm.kill_object(alloc1353) R.vm.kill_object(alloc1354) R.vm.kill_object(alloc1355) alloc1357: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), alloc1356, alloc1357) R.vm.kill_object(alloc1356) lv187: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1357, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1357) model_decoder_layers_13_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[807] alloc1358: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv187, model_decoder_layers_13_self_attn_out_proj_weight5, model_decoder_layers_13_self_attn_out_proj_bias5, alloc1351, alloc1358) R.vm.kill_object(alloc1351) R.vm.kill_object(lv187) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias5) model_decoder_layers_13_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[818] alloc1359: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1358, model_decoder_layers_13_encoder_attn_layer_norm_weight5, model_decoder_layers_13_encoder_attn_layer_norm_bias5, alloc1359) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias5) model_decoder_layers_13_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[814] alloc1360: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1359, model_decoder_layers_13_encoder_attn_q_proj_weight5, model_decoder_layers_13_encoder_attn_q_proj_bias5, alloc1360) R.vm.kill_object(alloc1359) 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias5) lv190: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1360, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1360) alloc1361: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1359: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), lv190, alloc1361) R.vm.kill_object(lv190) lv191: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1361, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1361) model_decoder_layers_13_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[816] alloc1362: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv191, model_decoder_layers_13_encoder_attn_out_proj_weight5, model_decoder_layers_13_encoder_attn_out_proj_bias5, alloc1358, alloc1362) R.vm.kill_object(alloc1358) R.vm.kill_object(lv191) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias5) model_decoder_layers_13_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[824] alloc1363: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1362, 
model_decoder_layers_13_final_layer_norm_weight5, model_decoder_layers_13_final_layer_norm_bias5, alloc1363) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias5) model_decoder_layers_13_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[820] alloc1364: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1363, model_decoder_layers_13_fc1_weight5, model_decoder_layers_13_fc1_bias5, alloc1364) R.vm.kill_object(alloc1363) R.vm.kill_object(model_decoder_layers_13_fc1_weight5) R.vm.kill_object(model_decoder_layers_13_fc1_bias5) model_decoder_layers_13_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[822] alloc1365: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1364, model_decoder_layers_13_fc2_weight5, model_decoder_layers_13_fc2_bias5, alloc1362, alloc1365) R.vm.kill_object(alloc1362) R.vm.kill_object(alloc1364) R.vm.kill_object(model_decoder_layers_13_fc2_weight5) R.vm.kill_object(model_decoder_layers_13_fc2_bias5) model_decoder_layers_14_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[833] alloc1366: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1365, model_decoder_layers_14_self_attn_layer_norm_weight5, model_decoder_layers_14_self_attn_layer_norm_bias5, alloc1366) 
R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias5) model_decoder_layers_14_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[829] alloc1367: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_q_proj_weight5, model_decoder_layers_14_self_attn_q_proj_bias5, alloc1367) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias5) model_decoder_layers_14_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] alloc1368: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1366, model_decoder_layers_14_self_attn_k_proj_weight5, alloc1368) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight5) model_decoder_layers_14_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[827] alloc1369: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_v_proj_weight5, model_decoder_layers_14_self_attn_v_proj_bias5, alloc1369) R.vm.kill_object(alloc1366) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias5) alloc1370: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1367, alloc1368, alloc1369, alloc1370) R.vm.kill_object(alloc1367) R.vm.kill_object(alloc1368) R.vm.kill_object(alloc1369) alloc1371: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1369: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), alloc1370, alloc1371) R.vm.kill_object(alloc1370) lv198: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1371, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1371) model_decoder_layers_14_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[831] alloc1372: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv198, model_decoder_layers_14_self_attn_out_proj_weight5, model_decoder_layers_14_self_attn_out_proj_bias5, alloc1365, alloc1372) R.vm.kill_object(alloc1365) R.vm.kill_object(lv198) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias5) model_decoder_layers_14_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[842] alloc1373: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1372, model_decoder_layers_14_encoder_attn_layer_norm_weight5, model_decoder_layers_14_encoder_attn_layer_norm_bias5, alloc1373) 
R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias5) model_decoder_layers_14_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[838] alloc1374: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1373, model_decoder_layers_14_encoder_attn_q_proj_weight5, model_decoder_layers_14_encoder_attn_q_proj_bias5, alloc1374) R.vm.kill_object(alloc1373) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias5) lv201: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1374, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1374) alloc1375: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1373: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), lv201, alloc1375) R.vm.kill_object(lv201) lv202: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1375, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1375) model_decoder_layers_14_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[840] alloc1376: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv202, 
model_decoder_layers_14_encoder_attn_out_proj_weight5, model_decoder_layers_14_encoder_attn_out_proj_bias5, alloc1372, alloc1376) R.vm.kill_object(alloc1372) R.vm.kill_object(lv202) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias5) model_decoder_layers_14_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[848] alloc1377: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1376, model_decoder_layers_14_final_layer_norm_weight5, model_decoder_layers_14_final_layer_norm_bias5, alloc1377) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias5) model_decoder_layers_14_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[844] alloc1378: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1377, model_decoder_layers_14_fc1_weight5, model_decoder_layers_14_fc1_bias5, alloc1378) R.vm.kill_object(alloc1377) R.vm.kill_object(model_decoder_layers_14_fc1_weight5) R.vm.kill_object(model_decoder_layers_14_fc1_bias5) model_decoder_layers_14_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[846] alloc1379: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1378, model_decoder_layers_14_fc2_weight5, model_decoder_layers_14_fc2_bias5, alloc1376, 
alloc1379) R.vm.kill_object(alloc1376) R.vm.kill_object(alloc1378) R.vm.kill_object(model_decoder_layers_14_fc2_weight5) R.vm.kill_object(model_decoder_layers_14_fc2_bias5) model_decoder_layers_15_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[857] alloc1380: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1379, model_decoder_layers_15_self_attn_layer_norm_weight5, model_decoder_layers_15_self_attn_layer_norm_bias5, alloc1380) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias5) model_decoder_layers_15_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[853] alloc1381: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_q_proj_weight5, model_decoder_layers_15_self_attn_q_proj_bias5, alloc1381) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias5) model_decoder_layers_15_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] alloc1382: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1380, model_decoder_layers_15_self_attn_k_proj_weight5, alloc1382) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight5) model_decoder_layers_15_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] 
model_decoder_layers_15_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[851] alloc1383: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_v_proj_weight5, model_decoder_layers_15_self_attn_v_proj_bias5, alloc1383) R.vm.kill_object(alloc1380) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias5) alloc1384: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1381, alloc1382, alloc1383, alloc1384) R.vm.kill_object(alloc1381) R.vm.kill_object(alloc1382) R.vm.kill_object(alloc1383) alloc1385: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1383: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), alloc1384, alloc1385) R.vm.kill_object(alloc1384) lv209: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1385, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1385) model_decoder_layers_15_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[855] alloc1386: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv209, model_decoder_layers_15_self_attn_out_proj_weight5, model_decoder_layers_15_self_attn_out_proj_bias5, alloc1379, alloc1386) R.vm.kill_object(alloc1379) R.vm.kill_object(lv209) 
R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias5) model_decoder_layers_15_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[866] alloc1387: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1386, model_decoder_layers_15_encoder_attn_layer_norm_weight5, model_decoder_layers_15_encoder_attn_layer_norm_bias5, alloc1387) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias5) model_decoder_layers_15_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[862] alloc1388: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1387, model_decoder_layers_15_encoder_attn_q_proj_weight5, model_decoder_layers_15_encoder_attn_q_proj_bias5, alloc1388) R.vm.kill_object(alloc1387) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias5) lv212: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1388, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1388) alloc1389: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1387: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), lv212, alloc1389) 
R.vm.kill_object(lv212) lv213: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1389, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1389) model_decoder_layers_15_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[864] alloc1390: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv213, model_decoder_layers_15_encoder_attn_out_proj_weight5, model_decoder_layers_15_encoder_attn_out_proj_bias5, alloc1386, alloc1390) R.vm.kill_object(alloc1386) R.vm.kill_object(lv213) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias5) model_decoder_layers_15_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[872] alloc1391: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1390, model_decoder_layers_15_final_layer_norm_weight5, model_decoder_layers_15_final_layer_norm_bias5, alloc1391) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias5) model_decoder_layers_15_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[868] alloc1392: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1391, model_decoder_layers_15_fc1_weight5, 
model_decoder_layers_15_fc1_bias5, alloc1392) R.vm.kill_object(alloc1391) R.vm.kill_object(model_decoder_layers_15_fc1_weight5) R.vm.kill_object(model_decoder_layers_15_fc1_bias5) model_decoder_layers_15_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[870] alloc1393: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1392, model_decoder_layers_15_fc2_weight5, model_decoder_layers_15_fc2_bias5, alloc1390, alloc1393) R.vm.kill_object(alloc1390) R.vm.kill_object(alloc1392) R.vm.kill_object(model_decoder_layers_15_fc2_weight5) R.vm.kill_object(model_decoder_layers_15_fc2_bias5) model_decoder_layers_16_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[881] alloc1394: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1393, model_decoder_layers_16_self_attn_layer_norm_weight5, model_decoder_layers_16_self_attn_layer_norm_bias5, alloc1394) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias5) model_decoder_layers_16_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[877] alloc1395: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_q_proj_weight5, model_decoder_layers_16_self_attn_q_proj_bias5, alloc1395) 
R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias5) model_decoder_layers_16_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] alloc1396: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1394, model_decoder_layers_16_self_attn_k_proj_weight5, alloc1396) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight5) model_decoder_layers_16_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[875] alloc1397: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_v_proj_weight5, model_decoder_layers_16_self_attn_v_proj_bias5, alloc1397) R.vm.kill_object(alloc1394) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias5) alloc1398: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1395, alloc1396, alloc1397, alloc1398) R.vm.kill_object(alloc1395) R.vm.kill_object(alloc1396) R.vm.kill_object(alloc1397) alloc1399: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), alloc1398, alloc1399) R.vm.kill_object(alloc1398) lv220: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1399, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1399) model_decoder_layers_16_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[879] alloc1400: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv220, model_decoder_layers_16_self_attn_out_proj_weight5, model_decoder_layers_16_self_attn_out_proj_bias5, alloc1393, alloc1400) R.vm.kill_object(alloc1393) R.vm.kill_object(lv220) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias5) model_decoder_layers_16_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[890] alloc1401: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1400, model_decoder_layers_16_encoder_attn_layer_norm_weight5, model_decoder_layers_16_encoder_attn_layer_norm_bias5, alloc1401) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias5) model_decoder_layers_16_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[886] alloc1402: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1401, model_decoder_layers_16_encoder_attn_q_proj_weight5, model_decoder_layers_16_encoder_attn_q_proj_bias5, alloc1402) R.vm.kill_object(alloc1401) 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias5) lv223: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1402, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1402) alloc1403: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), lv223, alloc1403) R.vm.kill_object(lv223) lv224: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1403, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1403) model_decoder_layers_16_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[888] alloc1404: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv224, model_decoder_layers_16_encoder_attn_out_proj_weight5, model_decoder_layers_16_encoder_attn_out_proj_bias5, alloc1400, alloc1404) R.vm.kill_object(alloc1400) R.vm.kill_object(lv224) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias5) model_decoder_layers_16_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[896] alloc1405: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1404, 
model_decoder_layers_16_final_layer_norm_weight5, model_decoder_layers_16_final_layer_norm_bias5, alloc1405) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias5) model_decoder_layers_16_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[892] alloc1406: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1405, model_decoder_layers_16_fc1_weight5, model_decoder_layers_16_fc1_bias5, alloc1406) R.vm.kill_object(alloc1405) R.vm.kill_object(model_decoder_layers_16_fc1_weight5) R.vm.kill_object(model_decoder_layers_16_fc1_bias5) model_decoder_layers_16_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[894] alloc1407: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1406, model_decoder_layers_16_fc2_weight5, model_decoder_layers_16_fc2_bias5, alloc1404, alloc1407) R.vm.kill_object(alloc1404) R.vm.kill_object(alloc1406) R.vm.kill_object(model_decoder_layers_16_fc2_weight5) R.vm.kill_object(model_decoder_layers_16_fc2_bias5) model_decoder_layers_17_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[905] alloc1408: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1407, model_decoder_layers_17_self_attn_layer_norm_weight5, model_decoder_layers_17_self_attn_layer_norm_bias5, alloc1408) 
R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias5) model_decoder_layers_17_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[901] alloc1409: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_q_proj_weight5, model_decoder_layers_17_self_attn_q_proj_bias5, alloc1409) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias5) model_decoder_layers_17_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] alloc1410: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1408, model_decoder_layers_17_self_attn_k_proj_weight5, alloc1410) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight5) model_decoder_layers_17_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[899] alloc1411: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_v_proj_weight5, model_decoder_layers_17_self_attn_v_proj_bias5, alloc1411) R.vm.kill_object(alloc1408) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias5) alloc1412: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1409, alloc1410, alloc1411, alloc1412) R.vm.kill_object(alloc1409) R.vm.kill_object(alloc1410) R.vm.kill_object(alloc1411) alloc1413: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1411: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), alloc1412, alloc1413) R.vm.kill_object(alloc1412) lv231: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1413, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1413) model_decoder_layers_17_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[903] alloc1414: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv231, model_decoder_layers_17_self_attn_out_proj_weight5, model_decoder_layers_17_self_attn_out_proj_bias5, alloc1407, alloc1414) R.vm.kill_object(alloc1407) R.vm.kill_object(lv231) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias5) model_decoder_layers_17_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[914] alloc1415: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1414, model_decoder_layers_17_encoder_attn_layer_norm_weight5, model_decoder_layers_17_encoder_attn_layer_norm_bias5, alloc1415) 
R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias5) model_decoder_layers_17_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[910] alloc1416: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1415, model_decoder_layers_17_encoder_attn_q_proj_weight5, model_decoder_layers_17_encoder_attn_q_proj_bias5, alloc1416) R.vm.kill_object(alloc1415) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias5) lv234: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1416, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1416) alloc1417: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1415: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), lv234, alloc1417) R.vm.kill_object(lv234) lv235: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1417, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1417) model_decoder_layers_17_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[912] alloc1418: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv235, 
model_decoder_layers_17_encoder_attn_out_proj_weight5, model_decoder_layers_17_encoder_attn_out_proj_bias5, alloc1414, alloc1418) R.vm.kill_object(alloc1414) R.vm.kill_object(lv235) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias5) model_decoder_layers_17_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[920] alloc1419: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1418, model_decoder_layers_17_final_layer_norm_weight5, model_decoder_layers_17_final_layer_norm_bias5, alloc1419) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias5) model_decoder_layers_17_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[916] alloc1420: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1419, model_decoder_layers_17_fc1_weight5, model_decoder_layers_17_fc1_bias5, alloc1420) R.vm.kill_object(alloc1419) R.vm.kill_object(model_decoder_layers_17_fc1_weight5) R.vm.kill_object(model_decoder_layers_17_fc1_bias5) model_decoder_layers_17_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[918] alloc1421: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1420, model_decoder_layers_17_fc2_weight5, model_decoder_layers_17_fc2_bias5, alloc1418, 
alloc1421) R.vm.kill_object(alloc1418) R.vm.kill_object(alloc1420) R.vm.kill_object(model_decoder_layers_17_fc2_weight5) R.vm.kill_object(model_decoder_layers_17_fc2_bias5) model_decoder_layers_18_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[929] alloc1422: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1421, model_decoder_layers_18_self_attn_layer_norm_weight5, model_decoder_layers_18_self_attn_layer_norm_bias5, alloc1422) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias5) model_decoder_layers_18_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[925] alloc1423: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_q_proj_weight5, model_decoder_layers_18_self_attn_q_proj_bias5, alloc1423) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias5) model_decoder_layers_18_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] alloc1424: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1422, model_decoder_layers_18_self_attn_k_proj_weight5, alloc1424) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight5) model_decoder_layers_18_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] 
model_decoder_layers_18_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[923] alloc1425: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_v_proj_weight5, model_decoder_layers_18_self_attn_v_proj_bias5, alloc1425) R.vm.kill_object(alloc1422) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias5) alloc1426: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1423, alloc1424, alloc1425, alloc1426) R.vm.kill_object(alloc1423) R.vm.kill_object(alloc1424) R.vm.kill_object(alloc1425) alloc1427: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1425: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), alloc1426, alloc1427) R.vm.kill_object(alloc1426) lv242: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1427, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1427) model_decoder_layers_18_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[927] alloc1428: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv242, model_decoder_layers_18_self_attn_out_proj_weight5, model_decoder_layers_18_self_attn_out_proj_bias5, alloc1421, alloc1428) R.vm.kill_object(alloc1421) R.vm.kill_object(lv242) 
R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias5) model_decoder_layers_18_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[938] alloc1429: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1428, model_decoder_layers_18_encoder_attn_layer_norm_weight5, model_decoder_layers_18_encoder_attn_layer_norm_bias5, alloc1429) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias5) model_decoder_layers_18_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[934] alloc1430: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1429, model_decoder_layers_18_encoder_attn_q_proj_weight5, model_decoder_layers_18_encoder_attn_q_proj_bias5, alloc1430) R.vm.kill_object(alloc1429) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias5) lv245: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1430, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1430) alloc1431: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1429: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), lv245, alloc1431) 
R.vm.kill_object(lv245) lv246: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1431, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1431) model_decoder_layers_18_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[936] alloc1432: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv246, model_decoder_layers_18_encoder_attn_out_proj_weight5, model_decoder_layers_18_encoder_attn_out_proj_bias5, alloc1428, alloc1432) R.vm.kill_object(alloc1428) R.vm.kill_object(lv246) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias5) model_decoder_layers_18_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[944] alloc1433: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1432, model_decoder_layers_18_final_layer_norm_weight5, model_decoder_layers_18_final_layer_norm_bias5, alloc1433) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias5) model_decoder_layers_18_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[940] alloc1434: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1433, model_decoder_layers_18_fc1_weight5, 
model_decoder_layers_18_fc1_bias5, alloc1434) R.vm.kill_object(alloc1433) R.vm.kill_object(model_decoder_layers_18_fc1_weight5) R.vm.kill_object(model_decoder_layers_18_fc1_bias5) model_decoder_layers_18_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[942] alloc1435: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1434, model_decoder_layers_18_fc2_weight5, model_decoder_layers_18_fc2_bias5, alloc1432, alloc1435) R.vm.kill_object(alloc1432) R.vm.kill_object(alloc1434) R.vm.kill_object(model_decoder_layers_18_fc2_weight5) R.vm.kill_object(model_decoder_layers_18_fc2_bias5) model_decoder_layers_19_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[953] alloc1436: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1435, model_decoder_layers_19_self_attn_layer_norm_weight5, model_decoder_layers_19_self_attn_layer_norm_bias5, alloc1436) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias5) model_decoder_layers_19_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[949] alloc1437: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_q_proj_weight5, model_decoder_layers_19_self_attn_q_proj_bias5, alloc1437) 
R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias5) model_decoder_layers_19_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] alloc1438: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1436, model_decoder_layers_19_self_attn_k_proj_weight5, alloc1438) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight5) model_decoder_layers_19_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[947] alloc1439: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_v_proj_weight5, model_decoder_layers_19_self_attn_v_proj_bias5, alloc1439) R.vm.kill_object(alloc1436) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias5) alloc1440: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1437, alloc1438, alloc1439, alloc1440) R.vm.kill_object(alloc1437) R.vm.kill_object(alloc1438) R.vm.kill_object(alloc1439) alloc1441: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1439: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), alloc1440, alloc1441) R.vm.kill_object(alloc1440) lv253: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1441, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1441) model_decoder_layers_19_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[951] alloc1442: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv253, model_decoder_layers_19_self_attn_out_proj_weight5, model_decoder_layers_19_self_attn_out_proj_bias5, alloc1435, alloc1442) R.vm.kill_object(alloc1435) R.vm.kill_object(lv253) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias5) model_decoder_layers_19_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[962] alloc1443: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1442, model_decoder_layers_19_encoder_attn_layer_norm_weight5, model_decoder_layers_19_encoder_attn_layer_norm_bias5, alloc1443) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias5) model_decoder_layers_19_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[958] alloc1444: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1443, model_decoder_layers_19_encoder_attn_q_proj_weight5, model_decoder_layers_19_encoder_attn_q_proj_bias5, alloc1444) R.vm.kill_object(alloc1443) 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias5) lv256: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1444, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1444) alloc1445: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1443: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), lv256, alloc1445) R.vm.kill_object(lv256) lv257: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1445, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1445) model_decoder_layers_19_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[960] alloc1446: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv257, model_decoder_layers_19_encoder_attn_out_proj_weight5, model_decoder_layers_19_encoder_attn_out_proj_bias5, alloc1442, alloc1446) R.vm.kill_object(alloc1442) R.vm.kill_object(lv257) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias5) model_decoder_layers_19_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[968] alloc1447: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1446, 
model_decoder_layers_19_final_layer_norm_weight5, model_decoder_layers_19_final_layer_norm_bias5, alloc1447) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias5) model_decoder_layers_19_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[964] alloc1448: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1447, model_decoder_layers_19_fc1_weight5, model_decoder_layers_19_fc1_bias5, alloc1448) R.vm.kill_object(alloc1447) R.vm.kill_object(model_decoder_layers_19_fc1_weight5) R.vm.kill_object(model_decoder_layers_19_fc1_bias5) model_decoder_layers_19_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[966] alloc1449: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1448, model_decoder_layers_19_fc2_weight5, model_decoder_layers_19_fc2_bias5, alloc1446, alloc1449) R.vm.kill_object(alloc1446) R.vm.kill_object(alloc1448) R.vm.kill_object(model_decoder_layers_19_fc2_weight5) R.vm.kill_object(model_decoder_layers_19_fc2_bias5) model_decoder_layers_20_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[977] alloc1450: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1449, model_decoder_layers_20_self_attn_layer_norm_weight5, model_decoder_layers_20_self_attn_layer_norm_bias5, alloc1450) 
R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias5) model_decoder_layers_20_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[973] alloc1451: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_q_proj_weight5, model_decoder_layers_20_self_attn_q_proj_bias5, alloc1451) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias5) model_decoder_layers_20_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] alloc1452: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1450, model_decoder_layers_20_self_attn_k_proj_weight5, alloc1452) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight5) model_decoder_layers_20_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[971] alloc1453: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_v_proj_weight5, model_decoder_layers_20_self_attn_v_proj_bias5, alloc1453) R.vm.kill_object(alloc1450) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias5) alloc1454: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1451, alloc1452, alloc1453, alloc1454) R.vm.kill_object(alloc1451) R.vm.kill_object(alloc1452) R.vm.kill_object(alloc1453) alloc1455: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), alloc1454, alloc1455) R.vm.kill_object(alloc1454) lv264_1: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1455, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1455) model_decoder_layers_20_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[975] alloc1456: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv264_1, model_decoder_layers_20_self_attn_out_proj_weight5, model_decoder_layers_20_self_attn_out_proj_bias5, alloc1449, alloc1456) R.vm.kill_object(alloc1449) R.vm.kill_object(lv264_1) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias5) model_decoder_layers_20_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[986] alloc1457: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1456, model_decoder_layers_20_encoder_attn_layer_norm_weight5, model_decoder_layers_20_encoder_attn_layer_norm_bias5, alloc1457) 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias5) model_decoder_layers_20_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[982] alloc1458: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1457, model_decoder_layers_20_encoder_attn_q_proj_weight5, model_decoder_layers_20_encoder_attn_q_proj_bias5, alloc1458) R.vm.kill_object(alloc1457) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias5) lv267: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1458, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1458) alloc1459: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), lv267, alloc1459) R.vm.kill_object(lv267) lv268: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1459, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1459) model_decoder_layers_20_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[984] alloc1460: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv268, 
model_decoder_layers_20_encoder_attn_out_proj_weight5, model_decoder_layers_20_encoder_attn_out_proj_bias5, alloc1456, alloc1460) R.vm.kill_object(alloc1456) R.vm.kill_object(lv268) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias5) model_decoder_layers_20_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[992] alloc1461: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1460, model_decoder_layers_20_final_layer_norm_weight5, model_decoder_layers_20_final_layer_norm_bias5, alloc1461) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias5) model_decoder_layers_20_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[988] alloc1462: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1461, model_decoder_layers_20_fc1_weight5, model_decoder_layers_20_fc1_bias5, alloc1462) R.vm.kill_object(alloc1461) R.vm.kill_object(model_decoder_layers_20_fc1_weight5) R.vm.kill_object(model_decoder_layers_20_fc1_bias5) model_decoder_layers_20_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[990] alloc1463: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1462, model_decoder_layers_20_fc2_weight5, model_decoder_layers_20_fc2_bias5, alloc1460, 
alloc1463) R.vm.kill_object(alloc1460) R.vm.kill_object(alloc1462) R.vm.kill_object(model_decoder_layers_20_fc2_weight5) R.vm.kill_object(model_decoder_layers_20_fc2_bias5) model_decoder_layers_21_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1001] alloc1464: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1463, model_decoder_layers_21_self_attn_layer_norm_weight5, model_decoder_layers_21_self_attn_layer_norm_bias5, alloc1464) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias5) model_decoder_layers_21_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[997] alloc1465: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_q_proj_weight5, model_decoder_layers_21_self_attn_q_proj_bias5, alloc1465) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias5) model_decoder_layers_21_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] alloc1466: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1464, model_decoder_layers_21_self_attn_k_proj_weight5, alloc1466) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight5) model_decoder_layers_21_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] 
model_decoder_layers_21_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[995] alloc1467: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_v_proj_weight5, model_decoder_layers_21_self_attn_v_proj_bias5, alloc1467) R.vm.kill_object(alloc1464) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias5) alloc1468: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1465, alloc1466, alloc1467, alloc1468) R.vm.kill_object(alloc1465) R.vm.kill_object(alloc1466) R.vm.kill_object(alloc1467) alloc1469: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1467: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), alloc1468, alloc1469) R.vm.kill_object(alloc1468) lv275: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1469, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1469) model_decoder_layers_21_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[999] alloc1470: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv275, model_decoder_layers_21_self_attn_out_proj_weight5, model_decoder_layers_21_self_attn_out_proj_bias5, alloc1463, alloc1470) R.vm.kill_object(alloc1463) R.vm.kill_object(lv275) 
R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias5) model_decoder_layers_21_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1010] alloc1471: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1470, model_decoder_layers_21_encoder_attn_layer_norm_weight5, model_decoder_layers_21_encoder_attn_layer_norm_bias5, alloc1471) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias5) model_decoder_layers_21_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1006] alloc1472: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1471, model_decoder_layers_21_encoder_attn_q_proj_weight5, model_decoder_layers_21_encoder_attn_q_proj_bias5, alloc1472) R.vm.kill_object(alloc1471) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias5) lv278: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1472, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1472) alloc1473: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1471: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), lv278, alloc1473) 
R.vm.kill_object(lv278) lv279: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1473, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1473) model_decoder_layers_21_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1008] alloc1474: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv279, model_decoder_layers_21_encoder_attn_out_proj_weight5, model_decoder_layers_21_encoder_attn_out_proj_bias5, alloc1470, alloc1474) R.vm.kill_object(alloc1470) R.vm.kill_object(lv279) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias5) model_decoder_layers_21_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1016] alloc1475: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1474, model_decoder_layers_21_final_layer_norm_weight5, model_decoder_layers_21_final_layer_norm_bias5, alloc1475) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias5) model_decoder_layers_21_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1012] alloc1476: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1475, model_decoder_layers_21_fc1_weight5, 
model_decoder_layers_21_fc1_bias5, alloc1476) R.vm.kill_object(alloc1475) R.vm.kill_object(model_decoder_layers_21_fc1_weight5) R.vm.kill_object(model_decoder_layers_21_fc1_bias5) model_decoder_layers_21_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1014] alloc1477: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1476, model_decoder_layers_21_fc2_weight5, model_decoder_layers_21_fc2_bias5, alloc1474, alloc1477) R.vm.kill_object(alloc1474) R.vm.kill_object(alloc1476) R.vm.kill_object(model_decoder_layers_21_fc2_weight5) R.vm.kill_object(model_decoder_layers_21_fc2_bias5) model_decoder_layers_22_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1025] alloc1478: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1477, model_decoder_layers_22_self_attn_layer_norm_weight5, model_decoder_layers_22_self_attn_layer_norm_bias5, alloc1478) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias5) model_decoder_layers_22_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1021] alloc1479: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_q_proj_weight5, model_decoder_layers_22_self_attn_q_proj_bias5, alloc1479) 
R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias5) model_decoder_layers_22_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] alloc1480: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1478, model_decoder_layers_22_self_attn_k_proj_weight5, alloc1480) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight5) model_decoder_layers_22_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1019] alloc1481: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_v_proj_weight5, model_decoder_layers_22_self_attn_v_proj_bias5, alloc1481) R.vm.kill_object(alloc1478) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias5) alloc1482: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1479, alloc1480, alloc1481, alloc1482) R.vm.kill_object(alloc1479) R.vm.kill_object(alloc1480) R.vm.kill_object(alloc1481) alloc1483: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1481: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), alloc1482, alloc1483) R.vm.kill_object(alloc1482) lv286: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1483, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1483) model_decoder_layers_22_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1023] alloc1484: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv286, model_decoder_layers_22_self_attn_out_proj_weight5, model_decoder_layers_22_self_attn_out_proj_bias5, alloc1477, alloc1484) R.vm.kill_object(alloc1477) R.vm.kill_object(lv286) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias5) model_decoder_layers_22_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1034] alloc1485: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1484, model_decoder_layers_22_encoder_attn_layer_norm_weight5, model_decoder_layers_22_encoder_attn_layer_norm_bias5, alloc1485) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias5) model_decoder_layers_22_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1030] alloc1486: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1485, model_decoder_layers_22_encoder_attn_q_proj_weight5, model_decoder_layers_22_encoder_attn_q_proj_bias5, alloc1486) 
R.vm.kill_object(alloc1485) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias5) lv289: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1486, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1486) alloc1487: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1485: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), lv289, alloc1487) R.vm.kill_object(lv289) lv290: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1487, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1487) model_decoder_layers_22_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1032] alloc1488: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv290, model_decoder_layers_22_encoder_attn_out_proj_weight5, model_decoder_layers_22_encoder_attn_out_proj_bias5, alloc1484, alloc1488) R.vm.kill_object(alloc1484) R.vm.kill_object(lv290) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias5) model_decoder_layers_22_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1039] model_decoder_layers_22_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1040] alloc1489: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1488, model_decoder_layers_22_final_layer_norm_weight5, model_decoder_layers_22_final_layer_norm_bias5, alloc1489) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias5) model_decoder_layers_22_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1036] alloc1490: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1489, model_decoder_layers_22_fc1_weight5, model_decoder_layers_22_fc1_bias5, alloc1490) R.vm.kill_object(alloc1489) R.vm.kill_object(model_decoder_layers_22_fc1_weight5) R.vm.kill_object(model_decoder_layers_22_fc1_bias5) model_decoder_layers_22_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1038] alloc1491: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1490, model_decoder_layers_22_fc2_weight5, model_decoder_layers_22_fc2_bias5, alloc1488, alloc1491) R.vm.kill_object(alloc1488) R.vm.kill_object(alloc1490) R.vm.kill_object(model_decoder_layers_22_fc2_weight5) R.vm.kill_object(model_decoder_layers_22_fc2_bias5) model_decoder_layers_23_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1049] alloc1492: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1491, model_decoder_layers_23_self_attn_layer_norm_weight5, model_decoder_layers_23_self_attn_layer_norm_bias5, alloc1492) 
R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias5) model_decoder_layers_23_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1045] alloc1493: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1492, model_decoder_layers_23_self_attn_q_proj_weight5, model_decoder_layers_23_self_attn_q_proj_bias5, alloc1493) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias5) model_decoder_layers_23_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] alloc1494: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1492, model_decoder_layers_23_self_attn_k_proj_weight5, alloc1494) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight5) model_decoder_layers_23_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1043] alloc1495: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1492, model_decoder_layers_23_self_attn_v_proj_weight5, model_decoder_layers_23_self_attn_v_proj_bias5, alloc1495) R.vm.kill_object(alloc1492) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias5) alloc1496: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1493, alloc1494, alloc1495, alloc1496) R.vm.kill_object(alloc1493) R.vm.kill_object(alloc1494) R.vm.kill_object(alloc1495) alloc1497: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1495: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), alloc1496, alloc1497) R.vm.kill_object(alloc1496) lv297: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1497, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1497) model_decoder_layers_23_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1047] alloc1498: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv297, model_decoder_layers_23_self_attn_out_proj_weight5, model_decoder_layers_23_self_attn_out_proj_bias5, alloc1491, alloc1498) R.vm.kill_object(alloc1491) R.vm.kill_object(lv297) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias5) model_decoder_layers_23_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1058] alloc1499: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1498, model_decoder_layers_23_encoder_attn_layer_norm_weight5, model_decoder_layers_23_encoder_attn_layer_norm_bias5, alloc1499) 
R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias5) model_decoder_layers_23_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1054] alloc1500: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1499, model_decoder_layers_23_encoder_attn_q_proj_weight5, model_decoder_layers_23_encoder_attn_q_proj_bias5, alloc1500) R.vm.kill_object(alloc1499) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias5) lv300: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1500, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1500) alloc1501: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), lv300, alloc1501) R.vm.kill_object(lv300) lv301: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1501, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1501) model_decoder_layers_23_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1056] alloc1502: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv301, 
model_decoder_layers_23_encoder_attn_out_proj_weight5, model_decoder_layers_23_encoder_attn_out_proj_bias5, alloc1498, alloc1502) R.vm.kill_object(alloc1498) R.vm.kill_object(lv301) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias5) model_decoder_layers_23_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1064] alloc1503: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1502, model_decoder_layers_23_final_layer_norm_weight5, model_decoder_layers_23_final_layer_norm_bias5, alloc1503) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias5) model_decoder_layers_23_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1060] alloc1504: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1503, model_decoder_layers_23_fc1_weight5, model_decoder_layers_23_fc1_bias5, alloc1504) R.vm.kill_object(alloc1503) R.vm.kill_object(model_decoder_layers_23_fc1_weight5) R.vm.kill_object(model_decoder_layers_23_fc1_bias5) model_decoder_layers_23_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1062] alloc1505: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1504, model_decoder_layers_23_fc2_weight5, model_decoder_layers_23_fc2_bias5, alloc1502, 
alloc1505) R.vm.kill_object(alloc1502) R.vm.kill_object(alloc1504) R.vm.kill_object(model_decoder_layers_23_fc2_weight5) R.vm.kill_object(model_decoder_layers_23_fc2_bias5) model_decoder_layers_24_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1073] alloc1506: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1505, model_decoder_layers_24_self_attn_layer_norm_weight5, model_decoder_layers_24_self_attn_layer_norm_bias5, alloc1506) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias5) model_decoder_layers_24_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1069] alloc1507: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_q_proj_weight5, model_decoder_layers_24_self_attn_q_proj_bias5, alloc1507) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias5) model_decoder_layers_24_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] alloc1508: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1506, model_decoder_layers_24_self_attn_k_proj_weight5, alloc1508) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight5) model_decoder_layers_24_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] 
model_decoder_layers_24_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1067] alloc1509: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_v_proj_weight5, model_decoder_layers_24_self_attn_v_proj_bias5, alloc1509) R.vm.kill_object(alloc1506) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias5) alloc1510: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1507, alloc1508, alloc1509, alloc1510) R.vm.kill_object(alloc1507) R.vm.kill_object(alloc1508) R.vm.kill_object(alloc1509) alloc1511: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1509: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), alloc1510, alloc1511) R.vm.kill_object(alloc1510) lv308: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1511, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1511) model_decoder_layers_24_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1071] alloc1512: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv308, model_decoder_layers_24_self_attn_out_proj_weight5, model_decoder_layers_24_self_attn_out_proj_bias5, alloc1505, alloc1512) R.vm.kill_object(alloc1505) 
R.vm.kill_object(lv308) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias5) model_decoder_layers_24_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1082] alloc1513: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1512, model_decoder_layers_24_encoder_attn_layer_norm_weight5, model_decoder_layers_24_encoder_attn_layer_norm_bias5, alloc1513) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias5) model_decoder_layers_24_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1078] alloc1514: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1513, model_decoder_layers_24_encoder_attn_q_proj_weight5, model_decoder_layers_24_encoder_attn_q_proj_bias5, alloc1514) R.vm.kill_object(alloc1513) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias5) lv311: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1514, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1514) alloc1515: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1513: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), lv311, 
alloc1515) R.vm.kill_object(lv311) lv312: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1515, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1515) model_decoder_layers_24_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1080] alloc1516: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv312, model_decoder_layers_24_encoder_attn_out_proj_weight5, model_decoder_layers_24_encoder_attn_out_proj_bias5, alloc1512, alloc1516) R.vm.kill_object(alloc1512) R.vm.kill_object(lv312) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias5) model_decoder_layers_24_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1088] alloc1517: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1516, model_decoder_layers_24_final_layer_norm_weight5, model_decoder_layers_24_final_layer_norm_bias5, alloc1517) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias5) model_decoder_layers_24_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1084] alloc1518: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1517, 
model_decoder_layers_24_fc1_weight5, model_decoder_layers_24_fc1_bias5, alloc1518) R.vm.kill_object(alloc1517) R.vm.kill_object(model_decoder_layers_24_fc1_weight5) R.vm.kill_object(model_decoder_layers_24_fc1_bias5) model_decoder_layers_24_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1086] alloc1519: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1518, model_decoder_layers_24_fc2_weight5, model_decoder_layers_24_fc2_bias5, alloc1516, alloc1519) R.vm.kill_object(alloc1516) R.vm.kill_object(alloc1518) R.vm.kill_object(model_decoder_layers_24_fc2_weight5) R.vm.kill_object(model_decoder_layers_24_fc2_bias5) model_decoder_layers_25_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1097] alloc1520: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1519, model_decoder_layers_25_self_attn_layer_norm_weight5, model_decoder_layers_25_self_attn_layer_norm_bias5, alloc1520) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias5) model_decoder_layers_25_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1093] alloc1521: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_q_proj_weight5, model_decoder_layers_25_self_attn_q_proj_bias5, alloc1521) 
R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias5) model_decoder_layers_25_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] alloc1522: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1520, model_decoder_layers_25_self_attn_k_proj_weight5, alloc1522) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight5) model_decoder_layers_25_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1091] alloc1523: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_v_proj_weight5, model_decoder_layers_25_self_attn_v_proj_bias5, alloc1523) R.vm.kill_object(alloc1520) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias5) alloc1524: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1521, alloc1522, alloc1523, alloc1524) R.vm.kill_object(alloc1521) R.vm.kill_object(alloc1522) R.vm.kill_object(alloc1523) alloc1525: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1523: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), alloc1524, alloc1525) R.vm.kill_object(alloc1524) lv319: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1525, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1525) model_decoder_layers_25_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1095] alloc1526: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv319, model_decoder_layers_25_self_attn_out_proj_weight5, model_decoder_layers_25_self_attn_out_proj_bias5, alloc1519, alloc1526) R.vm.kill_object(alloc1519) R.vm.kill_object(lv319) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias5) model_decoder_layers_25_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1106] alloc1527: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1526, model_decoder_layers_25_encoder_attn_layer_norm_weight5, model_decoder_layers_25_encoder_attn_layer_norm_bias5, alloc1527) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias5) model_decoder_layers_25_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1102] alloc1528: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1527, model_decoder_layers_25_encoder_attn_q_proj_weight5, model_decoder_layers_25_encoder_attn_q_proj_bias5, alloc1528) 
R.vm.kill_object(alloc1527) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias5) lv322: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1528, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1528) alloc1529: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1527: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), lv322, alloc1529) R.vm.kill_object(lv322) lv323: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1529, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1529) model_decoder_layers_25_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1104] alloc1530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv323, model_decoder_layers_25_encoder_attn_out_proj_weight5, model_decoder_layers_25_encoder_attn_out_proj_bias5, alloc1526, alloc1530) R.vm.kill_object(alloc1526) R.vm.kill_object(lv323) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias5) model_decoder_layers_25_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1112] alloc1531: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1530, model_decoder_layers_25_final_layer_norm_weight5, model_decoder_layers_25_final_layer_norm_bias5, alloc1531) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias5) model_decoder_layers_25_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1108] alloc1532: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1531, model_decoder_layers_25_fc1_weight5, model_decoder_layers_25_fc1_bias5, alloc1532) R.vm.kill_object(alloc1531) R.vm.kill_object(model_decoder_layers_25_fc1_weight5) R.vm.kill_object(model_decoder_layers_25_fc1_bias5) model_decoder_layers_25_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] model_decoder_layers_25_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1110] alloc1533: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1532, model_decoder_layers_25_fc2_weight5, model_decoder_layers_25_fc2_bias5, alloc1530, alloc1533) R.vm.kill_object(alloc1530) R.vm.kill_object(alloc1532) R.vm.kill_object(model_decoder_layers_25_fc2_weight5) R.vm.kill_object(model_decoder_layers_25_fc2_bias5) model_decoder_layers_26_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1121] alloc1534: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1533, model_decoder_layers_26_self_attn_layer_norm_weight5, model_decoder_layers_26_self_attn_layer_norm_bias5, alloc1534) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias5) model_decoder_layers_26_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1117] alloc1535: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_q_proj_weight5, model_decoder_layers_26_self_attn_q_proj_bias5, alloc1535) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias5) model_decoder_layers_26_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] alloc1536: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1534, model_decoder_layers_26_self_attn_k_proj_weight5, alloc1536) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight5) model_decoder_layers_26_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1115] alloc1537: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_v_proj_weight5, model_decoder_layers_26_self_attn_v_proj_bias5, alloc1537) R.vm.kill_object(alloc1534) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias5) alloc1538: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1535, alloc1536, alloc1537, alloc1538) R.vm.kill_object(alloc1535) R.vm.kill_object(alloc1536) R.vm.kill_object(alloc1537) alloc1539: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), alloc1538, alloc1539) R.vm.kill_object(alloc1538) lv330: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1539, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1539) model_decoder_layers_26_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1119] alloc1540: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv330, model_decoder_layers_26_self_attn_out_proj_weight5, model_decoder_layers_26_self_attn_out_proj_bias5, alloc1533, alloc1540) R.vm.kill_object(alloc1533) R.vm.kill_object(lv330) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias5) model_decoder_layers_26_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1130] alloc1541: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1540, model_decoder_layers_26_encoder_attn_layer_norm_weight5, model_decoder_layers_26_encoder_attn_layer_norm_bias5, alloc1541) 
R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias5) model_decoder_layers_26_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1126] alloc1542: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1541, model_decoder_layers_26_encoder_attn_q_proj_weight5, model_decoder_layers_26_encoder_attn_q_proj_bias5, alloc1542) R.vm.kill_object(alloc1541) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias5) lv333: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1542, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1542) alloc1543: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1541: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), lv333, alloc1543) R.vm.kill_object(lv333) lv334: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1543, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1543) model_decoder_layers_26_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1128] alloc1544: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv334, 
model_decoder_layers_26_encoder_attn_out_proj_weight5, model_decoder_layers_26_encoder_attn_out_proj_bias5, alloc1540, alloc1544) R.vm.kill_object(alloc1540) R.vm.kill_object(lv334) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias5) model_decoder_layers_26_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1136] alloc1545: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1544, model_decoder_layers_26_final_layer_norm_weight5, model_decoder_layers_26_final_layer_norm_bias5, alloc1545) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias5) model_decoder_layers_26_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1132] alloc1546: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1545, model_decoder_layers_26_fc1_weight5, model_decoder_layers_26_fc1_bias5, alloc1546) R.vm.kill_object(alloc1545) R.vm.kill_object(model_decoder_layers_26_fc1_weight5) R.vm.kill_object(model_decoder_layers_26_fc1_bias5) model_decoder_layers_26_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1134] alloc1547: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1546, model_decoder_layers_26_fc2_weight5, model_decoder_layers_26_fc2_bias5, alloc1544, 
alloc1547) R.vm.kill_object(alloc1544) R.vm.kill_object(alloc1546) R.vm.kill_object(model_decoder_layers_26_fc2_weight5) R.vm.kill_object(model_decoder_layers_26_fc2_bias5) model_decoder_layers_27_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1145] alloc1548: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1547, model_decoder_layers_27_self_attn_layer_norm_weight5, model_decoder_layers_27_self_attn_layer_norm_bias5, alloc1548) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias5) model_decoder_layers_27_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1141] alloc1549: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_q_proj_weight5, model_decoder_layers_27_self_attn_q_proj_bias5, alloc1549) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias5) model_decoder_layers_27_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] alloc1550: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1548, model_decoder_layers_27_self_attn_k_proj_weight5, alloc1550) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight5) model_decoder_layers_27_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] 
model_decoder_layers_27_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1139] alloc1551: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_v_proj_weight5, model_decoder_layers_27_self_attn_v_proj_bias5, alloc1551) R.vm.kill_object(alloc1548) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias5) alloc1552: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1549, alloc1550, alloc1551, alloc1552) R.vm.kill_object(alloc1549) R.vm.kill_object(alloc1550) R.vm.kill_object(alloc1551) alloc1553: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1551: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), alloc1552, alloc1553) R.vm.kill_object(alloc1552) lv341: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1553, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1553) model_decoder_layers_27_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1143] alloc1554: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv341, model_decoder_layers_27_self_attn_out_proj_weight5, model_decoder_layers_27_self_attn_out_proj_bias5, alloc1547, alloc1554) R.vm.kill_object(alloc1547) 
R.vm.kill_object(lv341) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias5) model_decoder_layers_27_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1154] alloc1555: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1554, model_decoder_layers_27_encoder_attn_layer_norm_weight5, model_decoder_layers_27_encoder_attn_layer_norm_bias5, alloc1555) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias5) model_decoder_layers_27_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1150] alloc1556: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1555, model_decoder_layers_27_encoder_attn_q_proj_weight5, model_decoder_layers_27_encoder_attn_q_proj_bias5, alloc1556) R.vm.kill_object(alloc1555) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias5) lv344: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1556, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1556) alloc1557: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1555: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), lv344, 
alloc1557) R.vm.kill_object(lv344) lv345: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1557, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1557) model_decoder_layers_27_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1152] alloc1558: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv345, model_decoder_layers_27_encoder_attn_out_proj_weight5, model_decoder_layers_27_encoder_attn_out_proj_bias5, alloc1554, alloc1558) R.vm.kill_object(alloc1554) R.vm.kill_object(lv345) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias5) model_decoder_layers_27_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1160] alloc1559: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1558, model_decoder_layers_27_final_layer_norm_weight5, model_decoder_layers_27_final_layer_norm_bias5, alloc1559) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias5) model_decoder_layers_27_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1156] alloc1560: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1559, 
model_decoder_layers_27_fc1_weight5, model_decoder_layers_27_fc1_bias5, alloc1560) R.vm.kill_object(alloc1559) R.vm.kill_object(model_decoder_layers_27_fc1_weight5) R.vm.kill_object(model_decoder_layers_27_fc1_bias5) model_decoder_layers_27_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1158] alloc1561: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1560, model_decoder_layers_27_fc2_weight5, model_decoder_layers_27_fc2_bias5, alloc1558, alloc1561) R.vm.kill_object(alloc1558) R.vm.kill_object(alloc1560) R.vm.kill_object(model_decoder_layers_27_fc2_weight5) R.vm.kill_object(model_decoder_layers_27_fc2_bias5) model_decoder_layers_28_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1169] alloc1562: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1561, model_decoder_layers_28_self_attn_layer_norm_weight5, model_decoder_layers_28_self_attn_layer_norm_bias5, alloc1562) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias5) model_decoder_layers_28_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1165] alloc1563: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_q_proj_weight5, model_decoder_layers_28_self_attn_q_proj_bias5, alloc1563) 
R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias5) model_decoder_layers_28_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] alloc1564: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1562, model_decoder_layers_28_self_attn_k_proj_weight5, alloc1564) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight5) model_decoder_layers_28_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1163] alloc1565: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_v_proj_weight5, model_decoder_layers_28_self_attn_v_proj_bias5, alloc1565) R.vm.kill_object(alloc1562) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias5) alloc1566: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1563, alloc1564, alloc1565, alloc1566) R.vm.kill_object(alloc1563) R.vm.kill_object(alloc1564) R.vm.kill_object(alloc1565) alloc1567: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1565: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), alloc1566, alloc1567) R.vm.kill_object(alloc1566) lv352: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1567, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1567) model_decoder_layers_28_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1167] alloc1568: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv352, model_decoder_layers_28_self_attn_out_proj_weight5, model_decoder_layers_28_self_attn_out_proj_bias5, alloc1561, alloc1568) R.vm.kill_object(alloc1561) R.vm.kill_object(lv352) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias5) model_decoder_layers_28_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1178] alloc1569: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1568, model_decoder_layers_28_encoder_attn_layer_norm_weight5, model_decoder_layers_28_encoder_attn_layer_norm_bias5, alloc1569) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias5) model_decoder_layers_28_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1174] alloc1570: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1569, model_decoder_layers_28_encoder_attn_q_proj_weight5, model_decoder_layers_28_encoder_attn_q_proj_bias5, alloc1570) 
R.vm.kill_object(alloc1569) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias5) lv355: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1570, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1570) alloc1571: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1569: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), lv355, alloc1571) R.vm.kill_object(lv355) lv356: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1571, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1571) model_decoder_layers_28_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1176] alloc1572: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv356, model_decoder_layers_28_encoder_attn_out_proj_weight5, model_decoder_layers_28_encoder_attn_out_proj_bias5, alloc1568, alloc1572) R.vm.kill_object(alloc1568) R.vm.kill_object(lv356) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias5) model_decoder_layers_28_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1184] alloc1573: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1572, model_decoder_layers_28_final_layer_norm_weight5, model_decoder_layers_28_final_layer_norm_bias5, alloc1573) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias5) model_decoder_layers_28_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1180] alloc1574: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1573, model_decoder_layers_28_fc1_weight5, model_decoder_layers_28_fc1_bias5, alloc1574) R.vm.kill_object(alloc1573) R.vm.kill_object(model_decoder_layers_28_fc1_weight5) R.vm.kill_object(model_decoder_layers_28_fc1_bias5) model_decoder_layers_28_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1182] alloc1575: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1574, model_decoder_layers_28_fc2_weight5, model_decoder_layers_28_fc2_bias5, alloc1572, alloc1575) R.vm.kill_object(alloc1572) R.vm.kill_object(alloc1574) R.vm.kill_object(model_decoder_layers_28_fc2_weight5) R.vm.kill_object(model_decoder_layers_28_fc2_bias5) model_decoder_layers_29_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1193] alloc1576: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1575, model_decoder_layers_29_self_attn_layer_norm_weight5, model_decoder_layers_29_self_attn_layer_norm_bias5, alloc1576) 
R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias5) model_decoder_layers_29_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1189] alloc1577: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_q_proj_weight5, model_decoder_layers_29_self_attn_q_proj_bias5, alloc1577) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias5) model_decoder_layers_29_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] alloc1578: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1576, model_decoder_layers_29_self_attn_k_proj_weight5, alloc1578) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight5) model_decoder_layers_29_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1187] alloc1579: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_v_proj_weight5, model_decoder_layers_29_self_attn_v_proj_bias5, alloc1579) R.vm.kill_object(alloc1576) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias5) alloc1580: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1577, alloc1578, alloc1579, alloc1580) R.vm.kill_object(alloc1577) R.vm.kill_object(alloc1578) R.vm.kill_object(alloc1579) alloc1581: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1579: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), alloc1580, alloc1581) R.vm.kill_object(alloc1580) lv363: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1581, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1581) model_decoder_layers_29_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1191] alloc1582: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv363, model_decoder_layers_29_self_attn_out_proj_weight5, model_decoder_layers_29_self_attn_out_proj_bias5, alloc1575, alloc1582) R.vm.kill_object(alloc1575) R.vm.kill_object(lv363) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias5) model_decoder_layers_29_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1202] alloc1583: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1582, model_decoder_layers_29_encoder_attn_layer_norm_weight5, model_decoder_layers_29_encoder_attn_layer_norm_bias5, alloc1583) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias5) model_decoder_layers_29_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1198] alloc1584: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1583, model_decoder_layers_29_encoder_attn_q_proj_weight5, model_decoder_layers_29_encoder_attn_q_proj_bias5, alloc1584) R.vm.kill_object(alloc1583) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias5) lv366: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1584, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1584) alloc1585: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1583: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), lv366, alloc1585) R.vm.kill_object(lv366) lv367: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1585, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1585) model_decoder_layers_29_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1200] alloc1586: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv367, 
model_decoder_layers_29_encoder_attn_out_proj_weight5, model_decoder_layers_29_encoder_attn_out_proj_bias5, alloc1582, alloc1586) R.vm.kill_object(alloc1582) R.vm.kill_object(lv367) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias5) model_decoder_layers_29_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1208] alloc1587: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1586, model_decoder_layers_29_final_layer_norm_weight5, model_decoder_layers_29_final_layer_norm_bias5, alloc1587) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias5) model_decoder_layers_29_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1204] alloc1588: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1587, model_decoder_layers_29_fc1_weight5, model_decoder_layers_29_fc1_bias5, alloc1588) R.vm.kill_object(alloc1587) R.vm.kill_object(model_decoder_layers_29_fc1_weight5) R.vm.kill_object(model_decoder_layers_29_fc1_bias5) model_decoder_layers_29_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1206] alloc1589: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1588, model_decoder_layers_29_fc2_weight5, model_decoder_layers_29_fc2_bias5, alloc1586, 
alloc1589) R.vm.kill_object(alloc1586) R.vm.kill_object(alloc1588) R.vm.kill_object(model_decoder_layers_29_fc2_weight5) R.vm.kill_object(model_decoder_layers_29_fc2_bias5) model_decoder_layers_30_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1217] alloc1590: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1589, model_decoder_layers_30_self_attn_layer_norm_weight5, model_decoder_layers_30_self_attn_layer_norm_bias5, alloc1590) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias5) model_decoder_layers_30_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1213] alloc1591: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_q_proj_weight5, model_decoder_layers_30_self_attn_q_proj_bias5, alloc1591) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias5) model_decoder_layers_30_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] alloc1592: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1590, model_decoder_layers_30_self_attn_k_proj_weight5, alloc1592) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight5) model_decoder_layers_30_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] 
model_decoder_layers_30_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1211] alloc1593: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_v_proj_weight5, model_decoder_layers_30_self_attn_v_proj_bias5, alloc1593) R.vm.kill_object(alloc1590) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias5) alloc1594: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1591, alloc1592, alloc1593, alloc1594) R.vm.kill_object(alloc1591) R.vm.kill_object(alloc1592) R.vm.kill_object(alloc1593) alloc1595: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1593: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), alloc1594, alloc1595) R.vm.kill_object(alloc1594) lv374: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1595, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1595) model_decoder_layers_30_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1215] alloc1596: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv374, model_decoder_layers_30_self_attn_out_proj_weight5, model_decoder_layers_30_self_attn_out_proj_bias5, alloc1589, alloc1596) R.vm.kill_object(alloc1589) 
R.vm.kill_object(lv374) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias5) model_decoder_layers_30_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1226] alloc1597: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1596, model_decoder_layers_30_encoder_attn_layer_norm_weight5, model_decoder_layers_30_encoder_attn_layer_norm_bias5, alloc1597) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias5) model_decoder_layers_30_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1222] alloc1598: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1597, model_decoder_layers_30_encoder_attn_q_proj_weight5, model_decoder_layers_30_encoder_attn_q_proj_bias5, alloc1598) R.vm.kill_object(alloc1597) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias5) lv377: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1598, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1598) alloc1599: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1597: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), lv377, 
alloc1599) R.vm.kill_object(lv377) lv378: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1599, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1599) model_decoder_layers_30_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1224] alloc1600: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv378, model_decoder_layers_30_encoder_attn_out_proj_weight5, model_decoder_layers_30_encoder_attn_out_proj_bias5, alloc1596, alloc1600) R.vm.kill_object(alloc1596) R.vm.kill_object(lv378) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias5) model_decoder_layers_30_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1232] alloc1601: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1600, model_decoder_layers_30_final_layer_norm_weight5, model_decoder_layers_30_final_layer_norm_bias5, alloc1601) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias5) model_decoder_layers_30_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1228] alloc1602: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1601, 
model_decoder_layers_30_fc1_weight5, model_decoder_layers_30_fc1_bias5, alloc1602) R.vm.kill_object(alloc1601) R.vm.kill_object(model_decoder_layers_30_fc1_weight5) R.vm.kill_object(model_decoder_layers_30_fc1_bias5) model_decoder_layers_30_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1230] alloc1603: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1602, model_decoder_layers_30_fc2_weight5, model_decoder_layers_30_fc2_bias5, alloc1600, alloc1603) R.vm.kill_object(alloc1600) R.vm.kill_object(alloc1602) R.vm.kill_object(model_decoder_layers_30_fc2_weight5) R.vm.kill_object(model_decoder_layers_30_fc2_bias5) model_decoder_layers_31_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1241] alloc1604: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1603, model_decoder_layers_31_self_attn_layer_norm_weight5, model_decoder_layers_31_self_attn_layer_norm_bias5, alloc1604) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias5) model_decoder_layers_31_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1237] alloc1605: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_q_proj_weight5, model_decoder_layers_31_self_attn_q_proj_bias5, alloc1605) 
R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias5) model_decoder_layers_31_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] alloc1606: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1604, model_decoder_layers_31_self_attn_k_proj_weight5, alloc1606) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight5) model_decoder_layers_31_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1235] alloc1607: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_v_proj_weight5, model_decoder_layers_31_self_attn_v_proj_bias5, alloc1607) R.vm.kill_object(alloc1604) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias5) alloc1608: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1605, alloc1606, alloc1607, alloc1608) R.vm.kill_object(alloc1605) R.vm.kill_object(alloc1606) R.vm.kill_object(alloc1607) alloc1609: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1607: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), alloc1608, alloc1609) R.vm.kill_object(alloc1608) lv385: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1609, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1609) model_decoder_layers_31_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1239] alloc1610: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage22) cls.fused_NT_matmul_add7_add6(lv385, model_decoder_layers_31_self_attn_out_proj_weight5, model_decoder_layers_31_self_attn_out_proj_bias5, alloc1603, alloc1610) R.vm.kill_object(alloc1603) R.vm.kill_object(lv385) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias5) model_decoder_layers_31_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1250] alloc1611: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1610, model_decoder_layers_31_encoder_attn_layer_norm_weight5, model_decoder_layers_31_encoder_attn_layer_norm_bias5, alloc1611) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias5) model_decoder_layers_31_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1246] alloc1612: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1611, model_decoder_layers_31_encoder_attn_q_proj_weight5, model_decoder_layers_31_encoder_attn_q_proj_bias5, 
alloc1612) R.vm.kill_object(alloc1611) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias5) lv388: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1612, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1612) alloc1613: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1611: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), lv388, alloc1613) R.vm.kill_object(lv388) lv389: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1613, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1613) model_decoder_layers_31_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1248] alloc1614: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage20) cls.fused_NT_matmul_add7_add6(lv389, model_decoder_layers_31_encoder_attn_out_proj_weight5, model_decoder_layers_31_encoder_attn_out_proj_bias5, alloc1610, alloc1614) R.vm.kill_object(alloc1610) R.vm.kill_object(lv389) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias5) model_decoder_layers_31_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1256] alloc1615: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.layer_norm3(alloc1614, model_decoder_layers_31_final_layer_norm_weight5, model_decoder_layers_31_final_layer_norm_bias5, alloc1615) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias5) model_decoder_layers_31_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1252] alloc1616: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) R.vm.kill_object(storage19) cls.fused_NT_matmul1_add8_gelu2(alloc1615, model_decoder_layers_31_fc1_weight5, model_decoder_layers_31_fc1_bias5, alloc1616) R.vm.kill_object(alloc1615) R.vm.kill_object(model_decoder_layers_31_fc1_weight5) R.vm.kill_object(model_decoder_layers_31_fc1_bias5) model_decoder_layers_31_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1254] alloc1617: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage21) cls.fused_NT_matmul2_add7_add6(alloc1616, model_decoder_layers_31_fc2_weight5, model_decoder_layers_31_fc2_bias5, alloc1614, alloc1617) R.vm.kill_object(alloc1614) R.vm.kill_object(alloc1616) R.vm.kill_object(model_decoder_layers_31_fc2_weight5) R.vm.kill_object(model_decoder_layers_31_fc2_bias5) model_decoder_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1258] alloc1618: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage23) cls.layer_norm3(alloc1617, model_decoder_layer_norm_weight5, 
model_decoder_layer_norm_bias5, alloc1618)
        R.vm.kill_object(alloc1617)
        R.vm.kill_object(model_decoder_layer_norm_weight5)
        R.vm.kill_object(model_decoder_layer_norm_bias5)
        # Logits buffer: 207464 bytes = 1 * 1 * 51866 * 4 (float32 vocabulary logits).
        storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
        alloc1619: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32"))
        R.vm.kill_object(storage)
        # Final vocabulary projection: reuses the decoder token-embedding matrix
        # as the output weight (weight tying), producing (1, 1, 51866) logits.
        cls.NT_matmul3(alloc1618, model_decoder_embed_tokens_weight5, alloc1619)
        R.vm.kill_object(model_decoder_embed_tokens_weight5)
        R.vm.kill_object(alloc1618)
        return alloc1619

    # VM-lowered sampling entry point. Given per-row probability distributions
    # `probs`, pre-drawn uniform random values `uniform_samples`, and a mapping
    # `sample_indices` from each sample to its source row of `probs`, it returns
    # one sampled token id (int32) per sample via the
    # `parallel_sampling_from_prob` kernel. All statements below (shape-heap
    # bookkeeping, check_tensor_info/match_shape guards, explicit
    # alloc/kill_object lifetimes) are compiler-generated memory planning —
    # their exact order is significant and must not be rearranged by hand.
    @R.function
    def multinomial_from_uniform(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32")) -> R.Tensor(("num_samples",), dtype="int32"):
        # Symbolic shape variables, bound at runtime via the shape heap below.
        num_samples = T.int64()
        batch_size = T.int64()
        vocab_size = T.int64()
        # Upper bounds let the compiler pre-size buffers (e.g. num_samples <= 8).
        R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
        cls = Module
        # Scratch heap with 3 int64 slots holding the runtime values of the
        # symbolic dims (batch_size, vocab_size, num_samples).
        shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
        # Validate rank and dtype of each argument against its annotation.
        R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
        # Bind the symbolic dims into the shape heap (and check consistency).
        R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
        # Reshape uniform_samples to a (num_samples, 1) column vector.
        gv6: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        uniform_samples_1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv6, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),))
        # Reshape sample_indices to a (num_samples, 1) column vector.
        gv7: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        sample_indices_1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv7, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),))
        # Output storage: 32 bytes = 8 (num_samples upper bound) * 4 (int32).
        storage3: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
        gv8: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        alloc3: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage3, R.prim_value(0), gv8, R.dtype("int32"))
        R.vm.kill_object(storage3)
        # Sampling kernel: writes one sampled token id per row into alloc3.
        cls.parallel_sampling_from_prob(probs, uniform_samples_1, sample_indices_1, alloc3)
        R.vm.kill_object(uniform_samples_1)
        R.vm.kill_object(sample_indices_1)
        # Flatten the (num_samples, 1) result back to (num_samples,).
        gv9: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
        gv: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc3, gv9, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),))
        R.vm.kill_object(alloc3)
        return gv

    # Prefill entry point; packed_params enumerates every model weight tensor
    # in order (signature continues below this span).
    @R.function
    def prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280,
5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): seq_len = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=prefill, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 
5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 
1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), 
R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight4: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv2580: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape1030: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv2580, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_tokens_weight4_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage37: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2581: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc1982: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2581, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight4_1, reshape1030, alloc1982) R.vm.kill_object(reshape1030) R.vm.kill_object(model_decoder_embed_tokens_weight4_1) gv2582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1031: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1982, gv2582, 
sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc1982) lv198: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_positions_weight4: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage38: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2583: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc1983: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2583, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight4, lv198, alloc1983) R.vm.kill_object(lv198) R.vm.kill_object(model_decoder_embed_positions_weight4) gv2584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1032: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1983, gv2584, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc1983) storage39: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1984: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2585, R.dtype("float16")) cls.add5(reshape1031, reshape1032, alloc1984) R.vm.kill_object(reshape1031) R.vm.kill_object(reshape1032) 
model_decoder_layers_0_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[497] gv2586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2586, R.dtype("float16")) cls.layer_norm2(alloc1984, model_decoder_layers_0_self_attn_layer_norm_weight4, model_decoder_layers_0_self_attn_layer_norm_bias4, alloc1985) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias4) model_decoder_layers_0_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[493] gv2587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1986: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2587, R.dtype("float16")) _1985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_q_proj_bias4, alloc1986) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias4) gv2588: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1033: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1986, gv2588, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1986) model_decoder_layers_0_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage40: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2589: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2589, R.dtype("float16")) _1986: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight4, alloc1985, alloc1987) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight4) gv2590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1034: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1987, gv2590, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1987) model_decoder_layers_0_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[491] storage41: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2591, R.dtype("float16")) _1987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_v_proj_bias4, alloc1988) R.vm.kill_object(alloc1985) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias4) gv2592: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1035: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1988, gv2592, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1988) gv2593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1989: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2593, R.dtype("float16")) cls.concatenate1(reshape1033, reshape1034, reshape1035, alloc1989) R.vm.kill_object(reshape1033) R.vm.kill_object(reshape1034) R.vm.kill_object(reshape1035) gv2594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1036: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1989, gv2594, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1989) gv2595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2595, R.dtype("float16")) _1989: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1036, alloc1990) R.vm.kill_object(reshape1036) gv2596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1037: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1990, gv2596, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1990) gv2597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1038: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1037, gv2597, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1037) model_decoder_layers_0_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[495] gv2598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1991: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2598, R.dtype("float16")) _1990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight4, reshape1038, model_decoder_layers_0_self_attn_out_proj_bias4, alloc1991) R.vm.kill_object(reshape1038) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias4) gv2599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2599, R.dtype("float16")) cls.add5(alloc1984, alloc1991, alloc1992) R.vm.kill_object(alloc1984) R.vm.kill_object(alloc1991) model_decoder_layers_0_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[506] gv2600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2600, R.dtype("float16")) cls.layer_norm2(alloc1992, model_decoder_layers_0_encoder_attn_layer_norm_weight4, model_decoder_layers_0_encoder_attn_layer_norm_bias4, alloc1993) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias4) model_decoder_layers_0_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[502] gv2601: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2601, R.dtype("float16")) _1993: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight4, alloc1993, model_decoder_layers_0_encoder_attn_q_proj_bias4, alloc1994) R.vm.kill_object(alloc1993) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias4) gv2602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1039: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1994, gv2602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1994) gv2603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1040: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1039, gv2603, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1039) gv2604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1995: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2604, R.dtype("float16")) _1994: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1040, alloc1995) R.vm.kill_object(reshape1040) gv2605: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1041: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1995, gv2605, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1995) gv2606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1042: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1041, gv2606, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1041) model_decoder_layers_0_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[504] gv2607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2607, R.dtype("float16")) _1995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight4, reshape1042, model_decoder_layers_0_encoder_attn_out_proj_bias4, alloc1996) R.vm.kill_object(reshape1042) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias4) gv2608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2608, R.dtype("float16")) cls.add5(alloc1992, alloc1996, alloc1997) R.vm.kill_object(alloc1992) R.vm.kill_object(alloc1996) model_decoder_layers_0_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[512] gv2609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2609, R.dtype("float16")) cls.layer_norm2(alloc1997, model_decoder_layers_0_final_layer_norm_weight4, model_decoder_layers_0_final_layer_norm_bias4, alloc1998) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias4) model_decoder_layers_0_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[508] gv2610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1999: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2610, R.dtype("float16")) _1998: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", 
model_decoder_layers_0_fc1_weight4, alloc1998, model_decoder_layers_0_fc1_bias4, alloc1999) R.vm.kill_object(alloc1998) R.vm.kill_object(model_decoder_layers_0_fc1_weight4) R.vm.kill_object(model_decoder_layers_0_fc1_bias4) model_decoder_layers_0_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[510] gv2611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2611, R.dtype("float16")) _1999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight4, alloc1999, model_decoder_layers_0_fc2_bias4, alloc2000) R.vm.kill_object(alloc1999) R.vm.kill_object(model_decoder_layers_0_fc2_weight4) R.vm.kill_object(model_decoder_layers_0_fc2_bias4) gv2612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2612, R.dtype("float16")) cls.add5(alloc1997, alloc2000, alloc2001) R.vm.kill_object(alloc1997) R.vm.kill_object(alloc2000) model_decoder_layers_1_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[521] gv2613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2002: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2613, R.dtype("float16")) cls.layer_norm2(alloc2001, model_decoder_layers_1_self_attn_layer_norm_weight4, model_decoder_layers_1_self_attn_layer_norm_bias4, alloc2002) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias4) model_decoder_layers_1_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[517] gv2614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2614, R.dtype("float16")) _2002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_q_proj_bias4, alloc2003) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias4) gv2615: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1043: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2003, gv2615, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2003) model_decoder_layers_1_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv2616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2616, R.dtype("float16")) _2003: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight4, alloc2002, alloc2004) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight4) gv2617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1044: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2004, gv2617, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2004) model_decoder_layers_1_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[515] gv2618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2618, R.dtype("float16")) _2004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_v_proj_bias4, alloc2005) R.vm.kill_object(alloc2002) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias4) gv2619: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1045: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2005, gv2619, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2005) gv2620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2006: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2620, R.dtype("float16")) cls.concatenate1(reshape1043, reshape1044, reshape1045, alloc2006) R.vm.kill_object(reshape1043) R.vm.kill_object(reshape1044) R.vm.kill_object(reshape1045) gv2621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1046: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2006, gv2621, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2006) gv2622: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2622, R.dtype("float16")) _2006: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1046, alloc2007) R.vm.kill_object(reshape1046) gv2623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1047: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2007, gv2623, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2007) gv2624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1048: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1047, gv2624, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1047) model_decoder_layers_1_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[519] gv2625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2625, R.dtype("float16")) _2007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight4, reshape1048, model_decoder_layers_1_self_attn_out_proj_bias4, alloc2008) R.vm.kill_object(reshape1048) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias4) gv2626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2009: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2626, R.dtype("float16")) cls.add5(alloc2001, alloc2008, alloc2009) R.vm.kill_object(alloc2001) R.vm.kill_object(alloc2008) model_decoder_layers_1_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[530] gv2627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2627, R.dtype("float16")) cls.layer_norm2(alloc2009, model_decoder_layers_1_encoder_attn_layer_norm_weight4, model_decoder_layers_1_encoder_attn_layer_norm_bias4, alloc2010) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias4) model_decoder_layers_1_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[526] gv2628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2628, R.dtype("float16")) _2010: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight4, alloc2010, model_decoder_layers_1_encoder_attn_q_proj_bias4, alloc2011) R.vm.kill_object(alloc2010) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias4) 
gv2629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1049: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2011, gv2629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2011) gv2630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1050: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1049, gv2630, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1049) gv2631: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2631, R.dtype("float16")) _2011: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1050, alloc2012) R.vm.kill_object(reshape1050) gv2632: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1051: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2012, gv2632, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2012) gv2633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1052: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1051, gv2633, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1051) model_decoder_layers_1_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[528] gv2634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2634, R.dtype("float16")) _2012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight4, reshape1052, model_decoder_layers_1_encoder_attn_out_proj_bias4, alloc2013) R.vm.kill_object(reshape1052) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias4) gv2635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2635, R.dtype("float16")) cls.add5(alloc2009, alloc2013, alloc2014) R.vm.kill_object(alloc2009) R.vm.kill_object(alloc2013) model_decoder_layers_1_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[536] 
gv2636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2015: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2636, R.dtype("float16")) cls.layer_norm2(alloc2014, model_decoder_layers_1_final_layer_norm_weight4, model_decoder_layers_1_final_layer_norm_bias4, alloc2015) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias4) model_decoder_layers_1_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[532] gv2637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2016: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2637, R.dtype("float16")) _2015: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight4, alloc2015, model_decoder_layers_1_fc1_bias4, alloc2016) R.vm.kill_object(alloc2015) R.vm.kill_object(model_decoder_layers_1_fc1_weight4) R.vm.kill_object(model_decoder_layers_1_fc1_bias4) model_decoder_layers_1_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[534] gv2638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2638, 
R.dtype("float16")) _2016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight4, alloc2016, model_decoder_layers_1_fc2_bias4, alloc2017) R.vm.kill_object(alloc2016) R.vm.kill_object(model_decoder_layers_1_fc2_weight4) R.vm.kill_object(model_decoder_layers_1_fc2_bias4) gv2639: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2639, R.dtype("float16")) cls.add5(alloc2014, alloc2017, alloc2018) R.vm.kill_object(alloc2014) R.vm.kill_object(alloc2017) model_decoder_layers_2_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[545] gv2640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2640, R.dtype("float16")) cls.layer_norm2(alloc2018, model_decoder_layers_2_self_attn_layer_norm_weight4, model_decoder_layers_2_self_attn_layer_norm_bias4, alloc2019) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias4) model_decoder_layers_2_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[541] gv2641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2641, R.dtype("float16")) _2019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_q_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_q_proj_bias4, alloc2020) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias4) gv2642: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1053: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2020, gv2642, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2020) model_decoder_layers_2_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv2643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2643, R.dtype("float16")) _2020: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight4, alloc2019, alloc2021) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight4) gv2644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1054: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2021, gv2644, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2021) model_decoder_layers_2_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[539] gv2645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2645, R.dtype("float16")) _2021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_v_proj_bias4, alloc2022) R.vm.kill_object(alloc2019) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias4) gv2646: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1055: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2022, gv2646, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2022) gv2647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2023: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2647, R.dtype("float16")) cls.concatenate1(reshape1053, reshape1054, 
reshape1055, alloc2023) R.vm.kill_object(reshape1053) R.vm.kill_object(reshape1054) R.vm.kill_object(reshape1055) gv2648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1056: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2023, gv2648, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2023) gv2649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2649, R.dtype("float16")) _2023: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1056, alloc2024) R.vm.kill_object(reshape1056) gv2650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1057: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2024, gv2650, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2024) gv2651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1058: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1057, gv2651, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) 
R.vm.kill_object(reshape1057) model_decoder_layers_2_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[543] gv2652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2652, R.dtype("float16")) _2024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight4, reshape1058, model_decoder_layers_2_self_attn_out_proj_bias4, alloc2025) R.vm.kill_object(reshape1058) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias4) gv2653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2653, R.dtype("float16")) cls.add5(alloc2018, alloc2025, alloc2026) R.vm.kill_object(alloc2018) R.vm.kill_object(alloc2025) model_decoder_layers_2_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[554] gv2654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2654, R.dtype("float16")) 
cls.layer_norm2(alloc2026, model_decoder_layers_2_encoder_attn_layer_norm_weight4, model_decoder_layers_2_encoder_attn_layer_norm_bias4, alloc2027) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias4) model_decoder_layers_2_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[550] gv2655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2655, R.dtype("float16")) _2027: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight4, alloc2027, model_decoder_layers_2_encoder_attn_q_proj_bias4, alloc2028) R.vm.kill_object(alloc2027) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias4) gv2656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1059: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2028, gv2656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2028) gv2657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1060: R.Tensor((seq_len, 20, 64), dtype="float16") 
= R.call_packed("vm.builtin.reshape", reshape1059, gv2657, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1059) gv2658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2658, R.dtype("float16")) _2028: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1060, alloc2029) R.vm.kill_object(reshape1060) gv2659: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1061: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2029, gv2659, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2029) gv2660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1062: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1061, gv2660, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1061) model_decoder_layers_2_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[552] gv2661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2030: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2661, R.dtype("float16")) _2029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight4, reshape1062, model_decoder_layers_2_encoder_attn_out_proj_bias4, alloc2030) R.vm.kill_object(reshape1062) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias4) gv2662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2662, R.dtype("float16")) cls.add5(alloc2026, alloc2030, alloc2031) R.vm.kill_object(alloc2026) R.vm.kill_object(alloc2030) model_decoder_layers_2_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[560] gv2663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2663, R.dtype("float16")) cls.layer_norm2(alloc2031, model_decoder_layers_2_final_layer_norm_weight4, model_decoder_layers_2_final_layer_norm_bias4, alloc2032) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias4) model_decoder_layers_2_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias4: 
R.Tensor((5120,), dtype="float16") = packed_params[556] gv2664: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2033: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2664, R.dtype("float16")) _2032: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight4, alloc2032, model_decoder_layers_2_fc1_bias4, alloc2033) R.vm.kill_object(alloc2032) R.vm.kill_object(model_decoder_layers_2_fc1_weight4) R.vm.kill_object(model_decoder_layers_2_fc1_bias4) model_decoder_layers_2_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[558] gv2665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2665, R.dtype("float16")) _2033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight4, alloc2033, model_decoder_layers_2_fc2_bias4, alloc2034) R.vm.kill_object(alloc2033) R.vm.kill_object(model_decoder_layers_2_fc2_weight4) R.vm.kill_object(model_decoder_layers_2_fc2_bias4) gv2666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2666, R.dtype("float16")) cls.add5(alloc2031, alloc2034, alloc2035) 
R.vm.kill_object(alloc2031) R.vm.kill_object(alloc2034) model_decoder_layers_3_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[569] gv2667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2667, R.dtype("float16")) cls.layer_norm2(alloc2035, model_decoder_layers_3_self_attn_layer_norm_weight4, model_decoder_layers_3_self_attn_layer_norm_bias4, alloc2036) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias4) model_decoder_layers_3_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[565] gv2668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2668, R.dtype("float16")) _2036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_q_proj_bias4, alloc2037) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias4) gv2669: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1063: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2037, gv2669, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2037) model_decoder_layers_3_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv2670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2038: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2670, R.dtype("float16")) _2037: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight4, alloc2036, alloc2038) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight4) gv2671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1064: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2038, gv2671, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2038) model_decoder_layers_3_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[563] gv2672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2672, 
R.dtype("float16")) _2038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_v_proj_bias4, alloc2039) R.vm.kill_object(alloc2036) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias4) gv2673: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1065: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2039, gv2673, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2039) gv2674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2040: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2674, R.dtype("float16")) cls.concatenate1(reshape1063, reshape1064, reshape1065, alloc2040) R.vm.kill_object(reshape1063) R.vm.kill_object(reshape1064) R.vm.kill_object(reshape1065) gv2675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1066: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2040, gv2675, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2040) gv2676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2676, R.dtype("float16")) _2040: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1066, alloc2041) R.vm.kill_object(reshape1066) gv2677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1067: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2041, gv2677, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2041) gv2678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1068: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1067, gv2678, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1067) model_decoder_layers_3_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[567] gv2679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2679, R.dtype("float16")) _2041: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_3_self_attn_out_proj_weight4, reshape1068, model_decoder_layers_3_self_attn_out_proj_bias4, alloc2042) R.vm.kill_object(reshape1068) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias4) gv2680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2043: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2680, R.dtype("float16")) cls.add5(alloc2035, alloc2042, alloc2043) R.vm.kill_object(alloc2035) R.vm.kill_object(alloc2042) model_decoder_layers_3_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[578] gv2681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2681, R.dtype("float16")) cls.layer_norm2(alloc2043, model_decoder_layers_3_encoder_attn_layer_norm_weight4, model_decoder_layers_3_encoder_attn_layer_norm_bias4, alloc2044) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias4) model_decoder_layers_3_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[574] gv2682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2682, R.dtype("float16")) _2044: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight4, alloc2044, model_decoder_layers_3_encoder_attn_q_proj_bias4, alloc2045) R.vm.kill_object(alloc2044) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias4) gv2683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1069: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2045, gv2683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2045) gv2684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1070: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1069, gv2684, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1069) gv2685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2685, R.dtype("float16")) _2045: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1070, alloc2046) 
R.vm.kill_object(reshape1070) gv2686: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1071: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2046, gv2686, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2046) gv2687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1072: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1071, gv2687, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1071) model_decoder_layers_3_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[576] gv2688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2688, R.dtype("float16")) _2046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight4, reshape1072, model_decoder_layers_3_encoder_attn_out_proj_bias4, alloc2047) R.vm.kill_object(reshape1072) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias4) gv2689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2689, R.dtype("float16")) cls.add5(alloc2043, alloc2047, alloc2048) R.vm.kill_object(alloc2043) R.vm.kill_object(alloc2047) model_decoder_layers_3_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[584] gv2690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2690, R.dtype("float16")) cls.layer_norm2(alloc2048, model_decoder_layers_3_final_layer_norm_weight4, model_decoder_layers_3_final_layer_norm_bias4, alloc2049) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias4) model_decoder_layers_3_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[580] gv2691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2050: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2691, R.dtype("float16")) _2049: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight4, alloc2049, model_decoder_layers_3_fc1_bias4, alloc2050) R.vm.kill_object(alloc2049) R.vm.kill_object(model_decoder_layers_3_fc1_weight4) 
R.vm.kill_object(model_decoder_layers_3_fc1_bias4) model_decoder_layers_3_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[582] gv2692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2692, R.dtype("float16")) _2050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight4, alloc2050, model_decoder_layers_3_fc2_bias4, alloc2051) R.vm.kill_object(alloc2050) R.vm.kill_object(model_decoder_layers_3_fc2_weight4) R.vm.kill_object(model_decoder_layers_3_fc2_bias4) gv2693: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2693, R.dtype("float16")) cls.add5(alloc2048, alloc2051, alloc2052) R.vm.kill_object(alloc2048) R.vm.kill_object(alloc2051) model_decoder_layers_4_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[593] gv2694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2694, R.dtype("float16")) cls.layer_norm2(alloc2052, model_decoder_layers_4_self_attn_layer_norm_weight4, 
model_decoder_layers_4_self_attn_layer_norm_bias4, alloc2053) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias4) model_decoder_layers_4_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[589] gv2695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2695, R.dtype("float16")) _2053: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_q_proj_bias4, alloc2054) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias4) gv2696: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1073: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2054, gv2696, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2054) model_decoder_layers_4_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv2697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv2697, R.dtype("float16")) _2054: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight4, alloc2053, alloc2055) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight4) gv2698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1074: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2055, gv2698, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2055) model_decoder_layers_4_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[587] gv2699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2699, R.dtype("float16")) _2055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_v_proj_bias4, alloc2056) R.vm.kill_object(alloc2053) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias4) gv2700: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1075: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2056, gv2700, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2056) gv2701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2057: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2701, R.dtype("float16")) cls.concatenate1(reshape1073, reshape1074, reshape1075, alloc2057) R.vm.kill_object(reshape1073) R.vm.kill_object(reshape1074) R.vm.kill_object(reshape1075) gv2702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1076: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2057, gv2702, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2057) gv2703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2058: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2703, R.dtype("float16")) _2057: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape1076, alloc2058) R.vm.kill_object(reshape1076) gv2704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1077: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2058, gv2704, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2058) gv2705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1078: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1077, gv2705, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1077) model_decoder_layers_4_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[591] gv2706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2706, R.dtype("float16")) _2058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight4, reshape1078, model_decoder_layers_4_self_attn_out_proj_bias4, alloc2059) R.vm.kill_object(reshape1078) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias4) gv2707: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2707, R.dtype("float16")) cls.add5(alloc2052, alloc2059, alloc2060) R.vm.kill_object(alloc2052) R.vm.kill_object(alloc2059) 
model_decoder_layers_4_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[602] gv2708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2708, R.dtype("float16")) cls.layer_norm2(alloc2060, model_decoder_layers_4_encoder_attn_layer_norm_weight4, model_decoder_layers_4_encoder_attn_layer_norm_bias4, alloc2061) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias4) model_decoder_layers_4_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[598] gv2709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2709, R.dtype("float16")) _2061: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight4, alloc2061, model_decoder_layers_4_encoder_attn_q_proj_bias4, alloc2062) R.vm.kill_object(alloc2061) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias4) gv2710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1079: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2062, gv2710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2062) gv2711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1080: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1079, gv2711, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1079) gv2712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2712, R.dtype("float16")) _2062: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape1080, alloc2063) R.vm.kill_object(reshape1080) gv2713: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1081: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2063, gv2713, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2063) gv2714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1082: 
R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1081, gv2714, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1081) model_decoder_layers_4_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[600] gv2715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2715, R.dtype("float16")) _2063: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight4, reshape1082, model_decoder_layers_4_encoder_attn_out_proj_bias4, alloc2064) R.vm.kill_object(reshape1082) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias4) gv2716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2716, R.dtype("float16")) cls.add5(alloc2060, alloc2064, alloc2065) R.vm.kill_object(alloc2060) R.vm.kill_object(alloc2064) model_decoder_layers_4_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[608] gv2717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2717, R.dtype("float16")) cls.layer_norm2(alloc2065, model_decoder_layers_4_final_layer_norm_weight4, model_decoder_layers_4_final_layer_norm_bias4, alloc2066) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias4) model_decoder_layers_4_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[604] gv2718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2067: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2718, R.dtype("float16")) _2066: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight4, alloc2066, model_decoder_layers_4_fc1_bias4, alloc2067) R.vm.kill_object(alloc2066) R.vm.kill_object(model_decoder_layers_4_fc1_weight4) R.vm.kill_object(model_decoder_layers_4_fc1_bias4) model_decoder_layers_4_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[606] gv2719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2719, R.dtype("float16")) _2067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight4, alloc2067, 
model_decoder_layers_4_fc2_bias4, alloc2068) R.vm.kill_object(alloc2067) R.vm.kill_object(model_decoder_layers_4_fc2_weight4) R.vm.kill_object(model_decoder_layers_4_fc2_bias4) gv2720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2720, R.dtype("float16")) cls.add5(alloc2065, alloc2068, alloc2069) R.vm.kill_object(alloc2065) R.vm.kill_object(alloc2068) model_decoder_layers_5_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[617] gv2721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2721, R.dtype("float16")) cls.layer_norm2(alloc2069, model_decoder_layers_5_self_attn_layer_norm_weight4, model_decoder_layers_5_self_attn_layer_norm_bias4, alloc2070) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias4) model_decoder_layers_5_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[613] gv2722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv2722, R.dtype("float16")) _2070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_q_proj_bias4, alloc2071) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias4) gv2723: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1083: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2071, gv2723, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2071) model_decoder_layers_5_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv2724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2724, R.dtype("float16")) _2071: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight4, alloc2070, alloc2072) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight4) gv2725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1084: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2072, gv2725, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2072) model_decoder_layers_5_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[611] gv2726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2726, R.dtype("float16")) _2072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_v_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_v_proj_bias4, alloc2073) R.vm.kill_object(alloc2070) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias4) gv2727: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1085: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2073, gv2727, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2073) gv2728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2074: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2728, R.dtype("float16")) cls.concatenate1(reshape1083, reshape1084, reshape1085, alloc2074) R.vm.kill_object(reshape1083) R.vm.kill_object(reshape1084) R.vm.kill_object(reshape1085) gv2729: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1086: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2074, gv2729, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2074) gv2730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2730, R.dtype("float16")) _2074: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1086, alloc2075) R.vm.kill_object(reshape1086) gv2731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1087: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2075, gv2731, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2075) gv2732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1088: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1087, gv2732, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1087) model_decoder_layers_5_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] 
model_decoder_layers_5_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[615] gv2733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2733, R.dtype("float16")) _2075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight4, reshape1088, model_decoder_layers_5_self_attn_out_proj_bias4, alloc2076) R.vm.kill_object(reshape1088) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias4) gv2734: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2734, R.dtype("float16")) cls.add5(alloc2069, alloc2076, alloc2077) R.vm.kill_object(alloc2069) R.vm.kill_object(alloc2076) model_decoder_layers_5_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[626] gv2735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2735, R.dtype("float16")) cls.layer_norm2(alloc2077, model_decoder_layers_5_encoder_attn_layer_norm_weight4, model_decoder_layers_5_encoder_attn_layer_norm_bias4, 
alloc2078) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias4) model_decoder_layers_5_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[622] gv2736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2736, R.dtype("float16")) _2078: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight4, alloc2078, model_decoder_layers_5_encoder_attn_q_proj_bias4, alloc2079) R.vm.kill_object(alloc2078) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias4) gv2737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1089: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2079, gv2737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2079) gv2738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1090: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1089, gv2738, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(reshape1089) gv2739: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2739, R.dtype("float16")) _2079: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1090, alloc2080) R.vm.kill_object(reshape1090) gv2740: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1091: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2080, gv2740, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2080) gv2741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1092: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1091, gv2741, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1091) model_decoder_layers_5_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[624] gv2742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2081: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2742, R.dtype("float16")) _2080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight4, reshape1092, model_decoder_layers_5_encoder_attn_out_proj_bias4, alloc2081) R.vm.kill_object(reshape1092) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias4) gv2743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2743, R.dtype("float16")) cls.add5(alloc2077, alloc2081, alloc2082) R.vm.kill_object(alloc2077) R.vm.kill_object(alloc2081) model_decoder_layers_5_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[632] gv2744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2744, R.dtype("float16")) cls.layer_norm2(alloc2082, model_decoder_layers_5_final_layer_norm_weight4, model_decoder_layers_5_final_layer_norm_bias4, alloc2083) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias4) model_decoder_layers_5_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[628] gv2745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2084: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2745, R.dtype("float16")) _2083: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight4, alloc2083, model_decoder_layers_5_fc1_bias4, alloc2084) R.vm.kill_object(alloc2083) R.vm.kill_object(model_decoder_layers_5_fc1_weight4) R.vm.kill_object(model_decoder_layers_5_fc1_bias4) model_decoder_layers_5_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[630] gv2746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2746, R.dtype("float16")) _2084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight4, alloc2084, model_decoder_layers_5_fc2_bias4, alloc2085) R.vm.kill_object(alloc2084) R.vm.kill_object(model_decoder_layers_5_fc2_weight4) R.vm.kill_object(model_decoder_layers_5_fc2_bias4) gv2747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2747, R.dtype("float16")) cls.add5(alloc2082, alloc2085, alloc2086) R.vm.kill_object(alloc2082) R.vm.kill_object(alloc2085) model_decoder_layers_6_self_attn_layer_norm_weight4: R.Tensor((1280,), 
dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[641] gv2748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2748, R.dtype("float16")) cls.layer_norm2(alloc2086, model_decoder_layers_6_self_attn_layer_norm_weight4, model_decoder_layers_6_self_attn_layer_norm_bias4, alloc2087) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias4) model_decoder_layers_6_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[637] gv2749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2749, R.dtype("float16")) _2087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_q_proj_bias4, alloc2088) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias4) gv2750: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1093: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2088, gv2750, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2088) model_decoder_layers_6_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv2751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2751, R.dtype("float16")) _2088: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight4, alloc2087, alloc2089) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight4) gv2752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1094: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2089, gv2752, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2089) model_decoder_layers_6_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[635] gv2753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2753, R.dtype("float16")) _2089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_6_self_attn_v_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_v_proj_bias4, alloc2090) R.vm.kill_object(alloc2087) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias4) gv2754: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1095: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2090, gv2754, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2090) gv2755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2091: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2755, R.dtype("float16")) cls.concatenate1(reshape1093, reshape1094, reshape1095, alloc2091) R.vm.kill_object(reshape1093) R.vm.kill_object(reshape1094) R.vm.kill_object(reshape1095) gv2756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1096: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2091, gv2756, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2091) gv2757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2092: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv2757, R.dtype("float16")) _2091: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1096, alloc2092) R.vm.kill_object(reshape1096) gv2758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1097: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2092, gv2758, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2092) gv2759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1098: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1097, gv2759, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1097) model_decoder_layers_6_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[639] gv2760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2760, R.dtype("float16")) _2092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight4, reshape1098, model_decoder_layers_6_self_attn_out_proj_bias4, alloc2093) 
R.vm.kill_object(reshape1098) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias4) gv2761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2761, R.dtype("float16")) cls.add5(alloc2086, alloc2093, alloc2094) R.vm.kill_object(alloc2086) R.vm.kill_object(alloc2093) model_decoder_layers_6_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[650] gv2762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2762, R.dtype("float16")) cls.layer_norm2(alloc2094, model_decoder_layers_6_encoder_attn_layer_norm_weight4, model_decoder_layers_6_encoder_attn_layer_norm_bias4, alloc2095) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias4) model_decoder_layers_6_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[646] gv2763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2096: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2763, R.dtype("float16")) _2095: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight4, alloc2095, model_decoder_layers_6_encoder_attn_q_proj_bias4, alloc2096) R.vm.kill_object(alloc2095) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias4) gv2764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1099: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2096, gv2764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2096) gv2765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1100: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1099, gv2765, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1099) gv2766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2766, R.dtype("float16")) _2096: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1100, alloc2097) R.vm.kill_object(reshape1100) gv2767: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1101: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2097, gv2767, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2097) gv2768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1102: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1101, gv2768, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1101) model_decoder_layers_6_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[648] gv2769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2769, R.dtype("float16")) _2097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight4, reshape1102, model_decoder_layers_6_encoder_attn_out_proj_bias4, alloc2098) R.vm.kill_object(reshape1102) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias4) gv2770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2770, R.dtype("float16")) cls.add5(alloc2094, alloc2098, alloc2099) R.vm.kill_object(alloc2094) R.vm.kill_object(alloc2098) model_decoder_layers_6_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[656] gv2771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2771, R.dtype("float16")) cls.layer_norm2(alloc2099, model_decoder_layers_6_final_layer_norm_weight4, model_decoder_layers_6_final_layer_norm_bias4, alloc2100) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias4) model_decoder_layers_6_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[652] gv2772: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2772, R.dtype("float16")) _2100: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight4, alloc2100, model_decoder_layers_6_fc1_bias4, alloc2101) R.vm.kill_object(alloc2100) R.vm.kill_object(model_decoder_layers_6_fc1_weight4) R.vm.kill_object(model_decoder_layers_6_fc1_bias4) model_decoder_layers_6_fc2_weight4: R.Tensor((1280, 5120), 
dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[654] gv2773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2773, R.dtype("float16")) _2101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_6_fc2_weight4, alloc2101, model_decoder_layers_6_fc2_bias4, alloc2102) R.vm.kill_object(alloc2101) R.vm.kill_object(model_decoder_layers_6_fc2_weight4) R.vm.kill_object(model_decoder_layers_6_fc2_bias4) gv2774: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2774, R.dtype("float16")) cls.add5(alloc2099, alloc2102, alloc2103) R.vm.kill_object(alloc2099) R.vm.kill_object(alloc2102) model_decoder_layers_7_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[665] gv2775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2775, R.dtype("float16")) cls.layer_norm2(alloc2103, model_decoder_layers_7_self_attn_layer_norm_weight4, model_decoder_layers_7_self_attn_layer_norm_bias4, alloc2104) 
R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias4) model_decoder_layers_7_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[661] gv2776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2776, R.dtype("float16")) _2104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_q_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_q_proj_bias4, alloc2105) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias4) gv2777: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1103: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2105, gv2777, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2105) model_decoder_layers_7_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv2778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2778, R.dtype("float16")) _2105: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight4, alloc2104, alloc2106) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight4) gv2779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1104: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2106, gv2779, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2106) model_decoder_layers_7_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[659] gv2780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2780, R.dtype("float16")) _2106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_v_proj_bias4, alloc2107) R.vm.kill_object(alloc2104) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias4) gv2781: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1105: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2107, gv2781, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2107) gv2782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2108: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2782, R.dtype("float16")) cls.concatenate1(reshape1103, reshape1104, reshape1105, alloc2108) R.vm.kill_object(reshape1103) R.vm.kill_object(reshape1104) R.vm.kill_object(reshape1105) gv2783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1106: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2108, gv2783, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2108) gv2784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2784, R.dtype("float16")) _2108: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1106, alloc2109) R.vm.kill_object(reshape1106) gv2785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1107: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2109, gv2785, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2109) gv2786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1108: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1107, gv2786, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1107) model_decoder_layers_7_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[663] gv2787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2787, R.dtype("float16")) _2109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight4, reshape1108, model_decoder_layers_7_self_attn_out_proj_bias4, alloc2110) R.vm.kill_object(reshape1108) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias4) gv2788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2788, R.dtype("float16")) cls.add5(alloc2103, alloc2110, alloc2111) R.vm.kill_object(alloc2103) R.vm.kill_object(alloc2110) model_decoder_layers_7_encoder_attn_layer_norm_weight4: R.Tensor((1280,), 
dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[674] gv2789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2789, R.dtype("float16")) cls.layer_norm2(alloc2111, model_decoder_layers_7_encoder_attn_layer_norm_weight4, model_decoder_layers_7_encoder_attn_layer_norm_bias4, alloc2112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias4) model_decoder_layers_7_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[670] gv2790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2113: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2790, R.dtype("float16")) _2112: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight4, alloc2112, model_decoder_layers_7_encoder_attn_q_proj_bias4, alloc2113) R.vm.kill_object(alloc2112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias4) gv2791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1109: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2113, gv2791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2113) gv2792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1110: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1109, gv2792, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1109) gv2793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2793, R.dtype("float16")) _2113: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1110, alloc2114) R.vm.kill_object(reshape1110) gv2794: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1111: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2114, gv2794, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2114) gv2795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1112: R.Tensor((1, seq_len, 1280), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape1111, gv2795, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1111) model_decoder_layers_7_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[672] gv2796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2796, R.dtype("float16")) _2114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight4, reshape1112, model_decoder_layers_7_encoder_attn_out_proj_bias4, alloc2115) R.vm.kill_object(reshape1112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias4) gv2797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2797, R.dtype("float16")) cls.add5(alloc2111, alloc2115, alloc2116) R.vm.kill_object(alloc2111) R.vm.kill_object(alloc2115) model_decoder_layers_7_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[680] gv2798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc2117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2798, R.dtype("float16")) cls.layer_norm2(alloc2116, model_decoder_layers_7_final_layer_norm_weight4, model_decoder_layers_7_final_layer_norm_bias4, alloc2117) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias4) model_decoder_layers_7_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[676] gv2799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2799, R.dtype("float16")) _2117: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight4, alloc2117, model_decoder_layers_7_fc1_bias4, alloc2118) R.vm.kill_object(alloc2117) R.vm.kill_object(model_decoder_layers_7_fc1_weight4) R.vm.kill_object(model_decoder_layers_7_fc1_bias4) model_decoder_layers_7_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[678] gv2800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2800, R.dtype("float16")) _2118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight4, alloc2118, model_decoder_layers_7_fc2_bias4, alloc2119) R.vm.kill_object(alloc2118) 
R.vm.kill_object(model_decoder_layers_7_fc2_weight4) R.vm.kill_object(model_decoder_layers_7_fc2_bias4) gv2801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2801, R.dtype("float16")) cls.add5(alloc2116, alloc2119, alloc2120) R.vm.kill_object(alloc2116) R.vm.kill_object(alloc2119) model_decoder_layers_8_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[689] gv2802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2802, R.dtype("float16")) cls.layer_norm2(alloc2120, model_decoder_layers_8_self_attn_layer_norm_weight4, model_decoder_layers_8_self_attn_layer_norm_bias4, alloc2121) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias4) model_decoder_layers_8_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[685] gv2803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2803, R.dtype("float16")) _2121: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_q_proj_bias4, alloc2122) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias4) gv2804: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1113: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2122, gv2804, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2122) model_decoder_layers_8_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv2805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2805, R.dtype("float16")) _2122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight4, alloc2121, alloc2123) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight4) gv2806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1114: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2123, gv2806, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2123) model_decoder_layers_8_self_attn_v_proj_weight4: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[683] gv2807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2807, R.dtype("float16")) _2123: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_v_proj_bias4, alloc2124) R.vm.kill_object(alloc2121) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias4) gv2808: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1115: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2124, gv2808, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2124) gv2809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2125: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2809, R.dtype("float16")) cls.concatenate1(reshape1113, reshape1114, reshape1115, alloc2125) R.vm.kill_object(reshape1113) R.vm.kill_object(reshape1114) R.vm.kill_object(reshape1115) gv2810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1116: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2125, gv2810, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2125) gv2811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2811, R.dtype("float16")) _2125: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1116, alloc2126) R.vm.kill_object(reshape1116) gv2812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1117: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2126, gv2812, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2126) gv2813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1118: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1117, gv2813, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1117) model_decoder_layers_8_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") 
= packed_params[687] gv2814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2814, R.dtype("float16")) _2126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight4, reshape1118, model_decoder_layers_8_self_attn_out_proj_bias4, alloc2127) R.vm.kill_object(reshape1118) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias4) gv2815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2815, R.dtype("float16")) cls.add5(alloc2120, alloc2127, alloc2128) R.vm.kill_object(alloc2120) R.vm.kill_object(alloc2127) model_decoder_layers_8_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[698] gv2816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2816, R.dtype("float16")) cls.layer_norm2(alloc2128, model_decoder_layers_8_encoder_attn_layer_norm_weight4, model_decoder_layers_8_encoder_attn_layer_norm_bias4, alloc2129) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias4) model_decoder_layers_8_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[694] gv2817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2817, R.dtype("float16")) _2129: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight4, alloc2129, model_decoder_layers_8_encoder_attn_q_proj_bias4, alloc2130) R.vm.kill_object(alloc2129) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias4) gv2818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1119: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2130, gv2818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2130) gv2819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1120: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1119, gv2819, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1119) gv2820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2820, R.dtype("float16")) _2130: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1120, alloc2131) R.vm.kill_object(reshape1120) gv2821: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1121: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2131, gv2821, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2131) gv2822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1122: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1121, gv2822, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1121) model_decoder_layers_8_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[696] gv2823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2823, R.dtype("float16")) _2131: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight4, reshape1122, model_decoder_layers_8_encoder_attn_out_proj_bias4, alloc2132) R.vm.kill_object(reshape1122) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias4) gv2824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2824, R.dtype("float16")) cls.add5(alloc2128, alloc2132, alloc2133) R.vm.kill_object(alloc2128) R.vm.kill_object(alloc2132) model_decoder_layers_8_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[704] gv2825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2825, R.dtype("float16")) cls.layer_norm2(alloc2133, model_decoder_layers_8_final_layer_norm_weight4, model_decoder_layers_8_final_layer_norm_bias4, alloc2134) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias4) model_decoder_layers_8_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[700] gv2826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2826, R.dtype("float16")) _2134: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight4, alloc2134, model_decoder_layers_8_fc1_bias4, alloc2135) R.vm.kill_object(alloc2134) R.vm.kill_object(model_decoder_layers_8_fc1_weight4) R.vm.kill_object(model_decoder_layers_8_fc1_bias4) model_decoder_layers_8_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[702] gv2827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2827, R.dtype("float16")) _2135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight4, alloc2135, model_decoder_layers_8_fc2_bias4, alloc2136) R.vm.kill_object(alloc2135) R.vm.kill_object(model_decoder_layers_8_fc2_weight4) R.vm.kill_object(model_decoder_layers_8_fc2_bias4) gv2828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2828, R.dtype("float16")) cls.add5(alloc2133, alloc2136, alloc2137) R.vm.kill_object(alloc2133) R.vm.kill_object(alloc2136) model_decoder_layers_9_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias4: R.Tensor((1280,), 
dtype="float16") = packed_params[713] gv2829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2829, R.dtype("float16")) cls.layer_norm2(alloc2137, model_decoder_layers_9_self_attn_layer_norm_weight4, model_decoder_layers_9_self_attn_layer_norm_bias4, alloc2138) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias4) model_decoder_layers_9_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[709] gv2830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2830, R.dtype("float16")) _2138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_q_proj_bias4, alloc2139) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias4) gv2831: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1123: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2139, gv2831, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2139) model_decoder_layers_9_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv2832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2832, R.dtype("float16")) _2139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight4, alloc2138, alloc2140) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight4) gv2833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1124: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2140, gv2833, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2140) model_decoder_layers_9_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[707] gv2834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2834, R.dtype("float16")) _2140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_v_proj_bias4, alloc2141) R.vm.kill_object(alloc2138) 
R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias4) gv2835: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1125: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2141, gv2835, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2141) gv2836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2142: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2836, R.dtype("float16")) cls.concatenate1(reshape1123, reshape1124, reshape1125, alloc2142) R.vm.kill_object(reshape1123) R.vm.kill_object(reshape1124) R.vm.kill_object(reshape1125) gv2837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1126: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2142, gv2837, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2142) gv2838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2838, R.dtype("float16")) _2142: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape1126, alloc2143) R.vm.kill_object(reshape1126) gv2839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1127: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2143, gv2839, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2143) gv2840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1128: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1127, gv2840, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1127) model_decoder_layers_9_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[711] gv2841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2841, R.dtype("float16")) _2143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight4, reshape1128, model_decoder_layers_9_self_attn_out_proj_bias4, alloc2144) R.vm.kill_object(reshape1128) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias4) gv2842: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2842, R.dtype("float16")) cls.add5(alloc2137, alloc2144, alloc2145) R.vm.kill_object(alloc2137) R.vm.kill_object(alloc2144) model_decoder_layers_9_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[722] gv2843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2843, R.dtype("float16")) cls.layer_norm2(alloc2145, model_decoder_layers_9_encoder_attn_layer_norm_weight4, model_decoder_layers_9_encoder_attn_layer_norm_bias4, alloc2146) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias4) model_decoder_layers_9_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[718] gv2844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2844, R.dtype("float16")) _2146: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight4, alloc2146, model_decoder_layers_9_encoder_attn_q_proj_bias4, alloc2147) R.vm.kill_object(alloc2146) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias4) gv2845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1129: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2147, gv2845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2147) gv2846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1130: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1129, gv2846, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1129) gv2847: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2847, R.dtype("float16")) _2147: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape1130, alloc2148) R.vm.kill_object(reshape1130) gv2848: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1131: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2148, gv2848, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2148) gv2849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1132: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1131, gv2849, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1131) model_decoder_layers_9_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[720] gv2850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2850, R.dtype("float16")) _2148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight4, reshape1132, model_decoder_layers_9_encoder_attn_out_proj_bias4, alloc2149) R.vm.kill_object(reshape1132) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias4) gv2851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2150: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv2851, R.dtype("float16")) cls.add5(alloc2145, alloc2149, alloc2150) R.vm.kill_object(alloc2145) R.vm.kill_object(alloc2149) model_decoder_layers_9_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[728] gv2852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2852, R.dtype("float16")) cls.layer_norm2(alloc2150, model_decoder_layers_9_final_layer_norm_weight4, model_decoder_layers_9_final_layer_norm_bias4, alloc2151) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias4) model_decoder_layers_9_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[724] gv2853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2853, R.dtype("float16")) _2151: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight4, alloc2151, model_decoder_layers_9_fc1_bias4, alloc2152) R.vm.kill_object(alloc2151) R.vm.kill_object(model_decoder_layers_9_fc1_weight4) R.vm.kill_object(model_decoder_layers_9_fc1_bias4) model_decoder_layers_9_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias4: 
R.Tensor((1280,), dtype="float16") = packed_params[726] gv2854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2854, R.dtype("float16")) _2152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight4, alloc2152, model_decoder_layers_9_fc2_bias4, alloc2153) R.vm.kill_object(alloc2152) R.vm.kill_object(model_decoder_layers_9_fc2_weight4) R.vm.kill_object(model_decoder_layers_9_fc2_bias4) gv2855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2855, R.dtype("float16")) cls.add5(alloc2150, alloc2153, alloc2154) R.vm.kill_object(alloc2150) R.vm.kill_object(alloc2153) model_decoder_layers_10_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[737] gv2856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2856, R.dtype("float16")) cls.layer_norm2(alloc2154, model_decoder_layers_10_self_attn_layer_norm_weight4, model_decoder_layers_10_self_attn_layer_norm_bias4, alloc2155) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias4) model_decoder_layers_10_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[733] gv2857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2857, R.dtype("float16")) _2155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_q_proj_bias4, alloc2156) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias4) gv2858: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1133: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2156, gv2858, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2156) model_decoder_layers_10_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] gv2859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2859, R.dtype("float16")) _2156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", 
model_decoder_layers_10_self_attn_k_proj_weight4, alloc2155, alloc2157) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight4) gv2860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1134: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2157, gv2860, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2157) model_decoder_layers_10_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[731] gv2861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2861, R.dtype("float16")) _2157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_v_proj_bias4, alloc2158) R.vm.kill_object(alloc2155) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias4) gv2862: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1135: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2158, gv2862, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2158) gv2863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2159: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2863, R.dtype("float16")) cls.concatenate1(reshape1133, reshape1134, reshape1135, alloc2159) R.vm.kill_object(reshape1133) R.vm.kill_object(reshape1134) R.vm.kill_object(reshape1135) gv2864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1136: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2159, gv2864, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2159) gv2865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2865, R.dtype("float16")) _2159: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1136, alloc2160) R.vm.kill_object(reshape1136) gv2866: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1137: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2160, gv2866, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2160) gv2867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1138: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1137, gv2867, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1137) model_decoder_layers_10_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[735] gv2868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2868, R.dtype("float16")) _2160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight4, reshape1138, model_decoder_layers_10_self_attn_out_proj_bias4, alloc2161) R.vm.kill_object(reshape1138) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias4) gv2869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2869, R.dtype("float16")) cls.add5(alloc2154, alloc2161, alloc2162) R.vm.kill_object(alloc2154) R.vm.kill_object(alloc2161) model_decoder_layers_10_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[746] gv2870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2870, R.dtype("float16")) cls.layer_norm2(alloc2162, model_decoder_layers_10_encoder_attn_layer_norm_weight4, model_decoder_layers_10_encoder_attn_layer_norm_bias4, alloc2163) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias4) model_decoder_layers_10_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[742] gv2871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2871, R.dtype("float16")) _2163: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight4, alloc2163, model_decoder_layers_10_encoder_attn_q_proj_bias4, alloc2164) R.vm.kill_object(alloc2163) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias4) gv2872: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape1139: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2164, gv2872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2164) gv2873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1140: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1139, gv2873, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1139) gv2874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2874, R.dtype("float16")) _2164: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1140, alloc2165) R.vm.kill_object(reshape1140) gv2875: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1141: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2165, gv2875, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2165) gv2876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1142: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape1141, gv2876, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1141) model_decoder_layers_10_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[744] gv2877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2877, R.dtype("float16")) _2165: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight4, reshape1142, model_decoder_layers_10_encoder_attn_out_proj_bias4, alloc2166) R.vm.kill_object(reshape1142) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias4) gv2878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2167: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2878, R.dtype("float16")) cls.add5(alloc2162, alloc2166, alloc2167) R.vm.kill_object(alloc2162) R.vm.kill_object(alloc2166) model_decoder_layers_10_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[752] gv2879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2168: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2879, R.dtype("float16")) cls.layer_norm2(alloc2167, model_decoder_layers_10_final_layer_norm_weight4, model_decoder_layers_10_final_layer_norm_bias4, alloc2168) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias4) model_decoder_layers_10_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[748] gv2880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2880, R.dtype("float16")) _2168: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight4, alloc2168, model_decoder_layers_10_fc1_bias4, alloc2169) R.vm.kill_object(alloc2168) R.vm.kill_object(model_decoder_layers_10_fc1_weight4) R.vm.kill_object(model_decoder_layers_10_fc1_bias4) model_decoder_layers_10_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[750] gv2881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2170: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2881, R.dtype("float16")) _2169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight4, alloc2169, model_decoder_layers_10_fc2_bias4, alloc2170) 
R.vm.kill_object(alloc2169) R.vm.kill_object(model_decoder_layers_10_fc2_weight4) R.vm.kill_object(model_decoder_layers_10_fc2_bias4) gv2882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2882, R.dtype("float16")) cls.add5(alloc2167, alloc2170, alloc2171) R.vm.kill_object(alloc2167) R.vm.kill_object(alloc2170) model_decoder_layers_11_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[761] gv2883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2883, R.dtype("float16")) cls.layer_norm2(alloc2171, model_decoder_layers_11_self_attn_layer_norm_weight4, model_decoder_layers_11_self_attn_layer_norm_bias4, alloc2172) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias4) model_decoder_layers_11_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[757] gv2884: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2884, 
R.dtype("float16")) _2172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_q_proj_bias4, alloc2173) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias4) gv2885: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1143: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2173, gv2885, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2173) model_decoder_layers_11_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv2886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2886, R.dtype("float16")) _2173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight4, alloc2172, alloc2174) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight4) gv2887: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1144: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2174, gv2887, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2174) 
model_decoder_layers_11_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[755] gv2888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2888, R.dtype("float16")) _2174: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_v_proj_bias4, alloc2175) R.vm.kill_object(alloc2172) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias4) gv2889: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1145: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2175, gv2889, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2175) gv2890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2176: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2890, R.dtype("float16")) cls.concatenate1(reshape1143, reshape1144, reshape1145, alloc2176) R.vm.kill_object(reshape1143) R.vm.kill_object(reshape1144) R.vm.kill_object(reshape1145) gv2891: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1146: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2176, gv2891, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2176) gv2892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2892, R.dtype("float16")) _2176: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1146, alloc2177) R.vm.kill_object(reshape1146) gv2893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1147: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2177, gv2893, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2177) gv2894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1148: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1147, gv2894, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1147) model_decoder_layers_11_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] 
model_decoder_layers_11_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[759] gv2895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2895, R.dtype("float16")) _2177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight4, reshape1148, model_decoder_layers_11_self_attn_out_proj_bias4, alloc2178) R.vm.kill_object(reshape1148) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias4) gv2896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2896, R.dtype("float16")) cls.add5(alloc2171, alloc2178, alloc2179) R.vm.kill_object(alloc2171) R.vm.kill_object(alloc2178) model_decoder_layers_11_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[770] gv2897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2897, R.dtype("float16")) cls.layer_norm2(alloc2179, model_decoder_layers_11_encoder_attn_layer_norm_weight4, 
model_decoder_layers_11_encoder_attn_layer_norm_bias4, alloc2180) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias4) model_decoder_layers_11_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[766] gv2898: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2181: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2898, R.dtype("float16")) _2180: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight4, alloc2180, model_decoder_layers_11_encoder_attn_q_proj_bias4, alloc2181) R.vm.kill_object(alloc2180) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias4) gv2899: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1149: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2181, gv2899, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2181) gv2900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1150: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1149, gv2900, 
sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1149) gv2901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2901, R.dtype("float16")) _2181: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1150, alloc2182) R.vm.kill_object(reshape1150) gv2902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1151: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2182, gv2902, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2182) gv2903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1152: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1151, gv2903, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1151) model_decoder_layers_11_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[768] gv2904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2904, R.dtype("float16")) _2182: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight4, reshape1152, model_decoder_layers_11_encoder_attn_out_proj_bias4, alloc2183) R.vm.kill_object(reshape1152) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias4) gv2905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2905, R.dtype("float16")) cls.add5(alloc2179, alloc2183, alloc2184) R.vm.kill_object(alloc2179) R.vm.kill_object(alloc2183) model_decoder_layers_11_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[776] gv2906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2906, R.dtype("float16")) cls.layer_norm2(alloc2184, model_decoder_layers_11_final_layer_norm_weight4, model_decoder_layers_11_final_layer_norm_bias4, alloc2185) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias4) model_decoder_layers_11_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias4: R.Tensor((5120,), dtype="float16") = 
packed_params[772] gv2907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2907, R.dtype("float16")) _2185: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight4, alloc2185, model_decoder_layers_11_fc1_bias4, alloc2186) R.vm.kill_object(alloc2185) R.vm.kill_object(model_decoder_layers_11_fc1_weight4) R.vm.kill_object(model_decoder_layers_11_fc1_bias4) model_decoder_layers_11_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[774] gv2908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2908, R.dtype("float16")) _2186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight4, alloc2186, model_decoder_layers_11_fc2_bias4, alloc2187) R.vm.kill_object(alloc2186) R.vm.kill_object(model_decoder_layers_11_fc2_weight4) R.vm.kill_object(model_decoder_layers_11_fc2_bias4) gv2909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2909, R.dtype("float16")) cls.add5(alloc2184, alloc2187, alloc2188) R.vm.kill_object(alloc2184) 
R.vm.kill_object(alloc2187) model_decoder_layers_12_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[785] gv2910: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2910, R.dtype("float16")) cls.layer_norm2(alloc2188, model_decoder_layers_12_self_attn_layer_norm_weight4, model_decoder_layers_12_self_attn_layer_norm_bias4, alloc2189) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias4) model_decoder_layers_12_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[781] gv2911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2911, R.dtype("float16")) _2189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_q_proj_bias4, alloc2190) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias4) gv2912: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1153: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2190, gv2912, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2190) model_decoder_layers_12_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv2913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2913, R.dtype("float16")) _2190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight4, alloc2189, alloc2191) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight4) gv2914: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1154: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2191, gv2914, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2191) model_decoder_layers_12_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[779] gv2915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2915, R.dtype("float16")) 
_2191: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_v_proj_bias4, alloc2192) R.vm.kill_object(alloc2189) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias4) gv2916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1155: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2192, gv2916, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2192) gv2917: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2193: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2917, R.dtype("float16")) cls.concatenate1(reshape1153, reshape1154, reshape1155, alloc2193) R.vm.kill_object(reshape1153) R.vm.kill_object(reshape1154) R.vm.kill_object(reshape1155) gv2918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1156: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2193, gv2918, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2193) gv2919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2919, R.dtype("float16")) _2193: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1156, alloc2194) R.vm.kill_object(reshape1156) gv2920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1157: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2194, gv2920, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2194) gv2921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1158: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1157, gv2921, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1157) model_decoder_layers_12_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[783] gv2922: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2922, R.dtype("float16")) _2194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight4, 
reshape1158, model_decoder_layers_12_self_attn_out_proj_bias4, alloc2195) R.vm.kill_object(reshape1158) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias4) gv2923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2196: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2923, R.dtype("float16")) cls.add5(alloc2188, alloc2195, alloc2196) R.vm.kill_object(alloc2188) R.vm.kill_object(alloc2195) model_decoder_layers_12_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[794] gv2924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2924, R.dtype("float16")) cls.layer_norm2(alloc2196, model_decoder_layers_12_encoder_attn_layer_norm_weight4, model_decoder_layers_12_encoder_attn_layer_norm_bias4, alloc2197) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias4) model_decoder_layers_12_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[790] gv2925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2198: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2925, R.dtype("float16")) _2197: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight4, alloc2197, model_decoder_layers_12_encoder_attn_q_proj_bias4, alloc2198) R.vm.kill_object(alloc2197) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias4) gv2926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1159: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2198, gv2926, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2198) gv2927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1160: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1159, gv2927, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1159) gv2928: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2928, R.dtype("float16")) _2198: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1160, alloc2199) R.vm.kill_object(reshape1160) gv2929: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1161: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2199, gv2929, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2199) gv2930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1162: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1161, gv2930, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1161) model_decoder_layers_12_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[792] gv2931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2931, R.dtype("float16")) _2199: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight4, reshape1162, model_decoder_layers_12_encoder_attn_out_proj_bias4, alloc2200) R.vm.kill_object(reshape1162) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias4) gv2932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2932, R.dtype("float16")) cls.add5(alloc2196, alloc2200, alloc2201) R.vm.kill_object(alloc2196) R.vm.kill_object(alloc2200) model_decoder_layers_12_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[800] gv2933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2933, R.dtype("float16")) cls.layer_norm2(alloc2201, model_decoder_layers_12_final_layer_norm_weight4, model_decoder_layers_12_final_layer_norm_bias4, alloc2202) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias4) model_decoder_layers_12_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[796] gv2934: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2934, R.dtype("float16")) _2202: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight4, alloc2202, model_decoder_layers_12_fc1_bias4, alloc2203) R.vm.kill_object(alloc2202) R.vm.kill_object(model_decoder_layers_12_fc1_weight4) 
R.vm.kill_object(model_decoder_layers_12_fc1_bias4) model_decoder_layers_12_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[798] gv2935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2935, R.dtype("float16")) _2203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight4, alloc2203, model_decoder_layers_12_fc2_bias4, alloc2204) R.vm.kill_object(alloc2203) R.vm.kill_object(model_decoder_layers_12_fc2_weight4) R.vm.kill_object(model_decoder_layers_12_fc2_bias4) gv2936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2936, R.dtype("float16")) cls.add5(alloc2201, alloc2204, alloc2205) R.vm.kill_object(alloc2201) R.vm.kill_object(alloc2204) model_decoder_layers_13_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[809] gv2937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2937, R.dtype("float16")) cls.layer_norm2(alloc2205, 
model_decoder_layers_13_self_attn_layer_norm_weight4, model_decoder_layers_13_self_attn_layer_norm_bias4, alloc2206) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias4) model_decoder_layers_13_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[805] gv2938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2938, R.dtype("float16")) _2206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_q_proj_bias4, alloc2207) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias4) gv2939: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1163: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2207, gv2939, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2207) model_decoder_layers_13_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv2940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2208: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2940, R.dtype("float16")) _2207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight4, alloc2206, alloc2208) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight4) gv2941: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1164: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2208, gv2941, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2208) model_decoder_layers_13_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[803] gv2942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2942, R.dtype("float16")) _2208: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_v_proj_bias4, alloc2209) R.vm.kill_object(alloc2206) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias4) gv2943: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1165: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2209, gv2943, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2209) gv2944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2210: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2944, R.dtype("float16")) cls.concatenate1(reshape1163, reshape1164, reshape1165, alloc2210) R.vm.kill_object(reshape1163) R.vm.kill_object(reshape1164) R.vm.kill_object(reshape1165) gv2945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1166: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2210, gv2945, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2210) gv2946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2946, R.dtype("float16")) _2210: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1166, alloc2211) R.vm.kill_object(reshape1166) gv2947: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1167: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2211, gv2947, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2211) gv2948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1168: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1167, gv2948, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1167) model_decoder_layers_13_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[807] gv2949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2949, R.dtype("float16")) _2211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight4, reshape1168, model_decoder_layers_13_self_attn_out_proj_bias4, alloc2212) R.vm.kill_object(reshape1168) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias4) gv2950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2950, 
R.dtype("float16")) cls.add5(alloc2205, alloc2212, alloc2213) R.vm.kill_object(alloc2205) R.vm.kill_object(alloc2212) model_decoder_layers_13_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[818] gv2951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2951, R.dtype("float16")) cls.layer_norm2(alloc2213, model_decoder_layers_13_encoder_attn_layer_norm_weight4, model_decoder_layers_13_encoder_attn_layer_norm_bias4, alloc2214) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias4) model_decoder_layers_13_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[814] gv2952: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2215: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2952, R.dtype("float16")) _2214: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight4, alloc2214, model_decoder_layers_13_encoder_attn_q_proj_bias4, alloc2215) R.vm.kill_object(alloc2214) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias4) gv2953: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1169: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2215, gv2953, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2215) gv2954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1170: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1169, gv2954, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1169) gv2955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2955, R.dtype("float16")) _2215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1170, alloc2216) R.vm.kill_object(reshape1170) gv2956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1171: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2216, gv2956, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2216) gv2957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1172: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1171, gv2957, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1171) model_decoder_layers_13_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[816] gv2958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2958, R.dtype("float16")) _2216: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight4, reshape1172, model_decoder_layers_13_encoder_attn_out_proj_bias4, alloc2217) R.vm.kill_object(reshape1172) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias4) gv2959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2959, R.dtype("float16")) cls.add5(alloc2213, alloc2217, alloc2218) R.vm.kill_object(alloc2213) R.vm.kill_object(alloc2217) model_decoder_layers_13_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[824] gv2960: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2960, R.dtype("float16")) cls.layer_norm2(alloc2218, model_decoder_layers_13_final_layer_norm_weight4, model_decoder_layers_13_final_layer_norm_bias4, alloc2219) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias4) model_decoder_layers_13_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[820] gv2961: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2961, R.dtype("float16")) _2219: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight4, alloc2219, model_decoder_layers_13_fc1_bias4, alloc2220) R.vm.kill_object(alloc2219) R.vm.kill_object(model_decoder_layers_13_fc1_weight4) R.vm.kill_object(model_decoder_layers_13_fc1_bias4) model_decoder_layers_13_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[822] gv2962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2962, 
R.dtype("float16")) _2220: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight4, alloc2220, model_decoder_layers_13_fc2_bias4, alloc2221) R.vm.kill_object(alloc2220) R.vm.kill_object(model_decoder_layers_13_fc2_weight4) R.vm.kill_object(model_decoder_layers_13_fc2_bias4) gv2963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2963, R.dtype("float16")) cls.add5(alloc2218, alloc2221, alloc2222) R.vm.kill_object(alloc2218) R.vm.kill_object(alloc2221) model_decoder_layers_14_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[833] gv2964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2223: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2964, R.dtype("float16")) cls.layer_norm2(alloc2222, model_decoder_layers_14_self_attn_layer_norm_weight4, model_decoder_layers_14_self_attn_layer_norm_bias4, alloc2223) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias4) model_decoder_layers_14_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[829] gv2965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2965, R.dtype("float16")) _2223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_q_proj_bias4, alloc2224) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias4) gv2966: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1173: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2224, gv2966, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2224) model_decoder_layers_14_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv2967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2967, R.dtype("float16")) _2224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight4, alloc2223, alloc2225) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight4) gv2968: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1174: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2225, gv2968, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2225) model_decoder_layers_14_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[827] gv2969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2969, R.dtype("float16")) _2225: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_v_proj_bias4, alloc2226) R.vm.kill_object(alloc2223) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias4) gv2970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1175: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2226, gv2970, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2226) gv2971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2227: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2971, R.dtype("float16")) 
cls.concatenate1(reshape1173, reshape1174, reshape1175, alloc2227) R.vm.kill_object(reshape1173) R.vm.kill_object(reshape1174) R.vm.kill_object(reshape1175) gv2972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1176: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2227, gv2972, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2227) gv2973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2973, R.dtype("float16")) _2227: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1176, alloc2228) R.vm.kill_object(reshape1176) gv2974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1177: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2228, gv2974, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2228) gv2975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1178: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1177, gv2975, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1177) model_decoder_layers_14_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[831] gv2976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2976, R.dtype("float16")) _2228: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight4, reshape1178, model_decoder_layers_14_self_attn_out_proj_bias4, alloc2229) R.vm.kill_object(reshape1178) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias4) gv2977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2977, R.dtype("float16")) cls.add5(alloc2222, alloc2229, alloc2230) R.vm.kill_object(alloc2222) R.vm.kill_object(alloc2229) model_decoder_layers_14_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[842] gv2978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv2978, R.dtype("float16")) cls.layer_norm2(alloc2230, model_decoder_layers_14_encoder_attn_layer_norm_weight4, model_decoder_layers_14_encoder_attn_layer_norm_bias4, alloc2231) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias4) model_decoder_layers_14_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[838] gv2979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2232: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2979, R.dtype("float16")) _2231: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight4, alloc2231, model_decoder_layers_14_encoder_attn_q_proj_bias4, alloc2232) R.vm.kill_object(alloc2231) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias4) gv2980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1179: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2232, gv2980, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2232) gv2981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1180: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1179, gv2981, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1179) gv2982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2982, R.dtype("float16")) _2232: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1180, alloc2233) R.vm.kill_object(reshape1180) gv2983: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1181: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2233, gv2983, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2233) gv2984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1182: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1181, gv2984, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1181) model_decoder_layers_14_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[840] gv2985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2985, R.dtype("float16")) _2233: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight4, reshape1182, model_decoder_layers_14_encoder_attn_out_proj_bias4, alloc2234) R.vm.kill_object(reshape1182) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias4) gv2986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2986, R.dtype("float16")) cls.add5(alloc2230, alloc2234, alloc2235) R.vm.kill_object(alloc2230) R.vm.kill_object(alloc2234) model_decoder_layers_14_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[848] gv2987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2987, R.dtype("float16")) cls.layer_norm2(alloc2235, model_decoder_layers_14_final_layer_norm_weight4, model_decoder_layers_14_final_layer_norm_bias4, alloc2236) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias4) model_decoder_layers_14_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[844] gv2988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2988, R.dtype("float16")) _2236: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight4, alloc2236, model_decoder_layers_14_fc1_bias4, alloc2237) R.vm.kill_object(alloc2236) R.vm.kill_object(model_decoder_layers_14_fc1_weight4) R.vm.kill_object(model_decoder_layers_14_fc1_bias4) model_decoder_layers_14_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[846] gv2989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2989, R.dtype("float16")) _2237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight4, alloc2237, model_decoder_layers_14_fc2_bias4, alloc2238) R.vm.kill_object(alloc2237) R.vm.kill_object(model_decoder_layers_14_fc2_weight4) R.vm.kill_object(model_decoder_layers_14_fc2_bias4) gv2990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv2990, R.dtype("float16")) cls.add5(alloc2235, alloc2238, alloc2239) R.vm.kill_object(alloc2235) R.vm.kill_object(alloc2238) model_decoder_layers_15_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[857] gv2991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2991, R.dtype("float16")) cls.layer_norm2(alloc2239, model_decoder_layers_15_self_attn_layer_norm_weight4, model_decoder_layers_15_self_attn_layer_norm_bias4, alloc2240) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias4) model_decoder_layers_15_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[853] gv2992: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2992, R.dtype("float16")) _2240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_q_proj_bias4, alloc2241) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias4) gv2993: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1183: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2241, gv2993, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2241) model_decoder_layers_15_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv2994: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2994, R.dtype("float16")) _2241: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight4, alloc2240, alloc2242) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight4) gv2995: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1184: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2242, gv2995, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2242) model_decoder_layers_15_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[851] gv2996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2243: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2996, R.dtype("float16")) _2242: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_v_proj_bias4, alloc2243) R.vm.kill_object(alloc2240) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias4) gv2997: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1185: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2243, gv2997, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2243) gv2998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2244: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2998, R.dtype("float16")) cls.concatenate1(reshape1183, reshape1184, reshape1185, alloc2244) R.vm.kill_object(reshape1183) R.vm.kill_object(reshape1184) R.vm.kill_object(reshape1185) gv2999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1186: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2244, gv2999, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2244) gv3000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3000, R.dtype("float16")) _2244: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape1186, alloc2245) R.vm.kill_object(reshape1186) gv3001: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1187: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2245, gv3001, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2245) gv3002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1188: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1187, gv3002, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1187) model_decoder_layers_15_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[855] gv3003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3003, R.dtype("float16")) _2245: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight4, reshape1188, model_decoder_layers_15_self_attn_out_proj_bias4, alloc2246) R.vm.kill_object(reshape1188) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias4) gv3004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3004, R.dtype("float16")) cls.add5(alloc2239, alloc2246, alloc2247) R.vm.kill_object(alloc2239) R.vm.kill_object(alloc2246) model_decoder_layers_15_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[866] gv3005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3005, R.dtype("float16")) cls.layer_norm2(alloc2247, model_decoder_layers_15_encoder_attn_layer_norm_weight4, model_decoder_layers_15_encoder_attn_layer_norm_bias4, alloc2248) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias4) model_decoder_layers_15_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[862] gv3006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2249: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3006, R.dtype("float16")) _2248: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight4, alloc2248, model_decoder_layers_15_encoder_attn_q_proj_bias4, alloc2249) R.vm.kill_object(alloc2248) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias4) gv3007: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1189: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2249, gv3007, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2249) gv3008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1190: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1189, gv3008, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1189) gv3009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3009, R.dtype("float16")) _2249: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, 
R.prim_value(15), R.prim_value(T.float32(1)), reshape1190, alloc2250) R.vm.kill_object(reshape1190) gv3010: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1191: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2250, gv3010, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2250) gv3011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1192: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1191, gv3011, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1191) model_decoder_layers_15_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[864] gv3012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3012, R.dtype("float16")) _2250: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight4, reshape1192, model_decoder_layers_15_encoder_attn_out_proj_bias4, alloc2251) R.vm.kill_object(reshape1192) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias4) gv3013: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3013, R.dtype("float16")) cls.add5(alloc2247, alloc2251, alloc2252) R.vm.kill_object(alloc2247) R.vm.kill_object(alloc2251) model_decoder_layers_15_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[872] gv3014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2253: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3014, R.dtype("float16")) cls.layer_norm2(alloc2252, model_decoder_layers_15_final_layer_norm_weight4, model_decoder_layers_15_final_layer_norm_bias4, alloc2253) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias4) model_decoder_layers_15_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[868] gv3015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3015, R.dtype("float16")) _2253: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight4, alloc2253, model_decoder_layers_15_fc1_bias4, 
alloc2254) R.vm.kill_object(alloc2253) R.vm.kill_object(model_decoder_layers_15_fc1_weight4) R.vm.kill_object(model_decoder_layers_15_fc1_bias4) model_decoder_layers_15_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[870] gv3016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3016, R.dtype("float16")) _2254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight4, alloc2254, model_decoder_layers_15_fc2_bias4, alloc2255) R.vm.kill_object(alloc2254) R.vm.kill_object(model_decoder_layers_15_fc2_weight4) R.vm.kill_object(model_decoder_layers_15_fc2_bias4) gv3017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3017, R.dtype("float16")) cls.add5(alloc2252, alloc2255, alloc2256) R.vm.kill_object(alloc2252) R.vm.kill_object(alloc2255) model_decoder_layers_16_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[881] gv3018: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), 
gv3018, R.dtype("float16")) cls.layer_norm2(alloc2256, model_decoder_layers_16_self_attn_layer_norm_weight4, model_decoder_layers_16_self_attn_layer_norm_bias4, alloc2257) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias4) model_decoder_layers_16_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[877] gv3019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3019, R.dtype("float16")) _2257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_q_proj_bias4, alloc2258) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias4) gv3020: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1193: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2258, gv3020, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2258) model_decoder_layers_16_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv3021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3021, R.dtype("float16")) _2258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight4, alloc2257, alloc2259) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight4) gv3022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1194: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2259, gv3022, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2259) model_decoder_layers_16_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[875] gv3023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3023, R.dtype("float16")) _2259: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_v_proj_bias4, alloc2260) R.vm.kill_object(alloc2257) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias4) gv3024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1195: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2260, gv3024, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2260) gv3025: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2261: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3025, R.dtype("float16")) cls.concatenate1(reshape1193, reshape1194, reshape1195, alloc2261) R.vm.kill_object(reshape1193) R.vm.kill_object(reshape1194) R.vm.kill_object(reshape1195) gv3026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1196: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2261, gv3026, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2261) gv3027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3027, R.dtype("float16")) _2261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1196, alloc2262) R.vm.kill_object(reshape1196) gv3028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1197: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2262, gv3028, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2262) gv3029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1198: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1197, gv3029, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1197) model_decoder_layers_16_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[879] gv3030: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3030, R.dtype("float16")) _2262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight4, reshape1198, model_decoder_layers_16_self_attn_out_proj_bias4, alloc2263) R.vm.kill_object(reshape1198) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias4) gv3031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, 
R.prim_value(0), gv3031, R.dtype("float16")) cls.add5(alloc2256, alloc2263, alloc2264) R.vm.kill_object(alloc2256) R.vm.kill_object(alloc2263) model_decoder_layers_16_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[890] gv3032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2265: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3032, R.dtype("float16")) cls.layer_norm2(alloc2264, model_decoder_layers_16_encoder_attn_layer_norm_weight4, model_decoder_layers_16_encoder_attn_layer_norm_bias4, alloc2265) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias4) model_decoder_layers_16_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[886] gv3033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2266: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3033, R.dtype("float16")) _2265: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight4, alloc2265, model_decoder_layers_16_encoder_attn_q_proj_bias4, alloc2266) R.vm.kill_object(alloc2265) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias4) gv3034: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1199: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2266, gv3034, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2266) gv3035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1200: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1199, gv3035, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1199) gv3036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3036, R.dtype("float16")) _2266: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1200, alloc2267) R.vm.kill_object(reshape1200) gv3037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1201: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2267, gv3037, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2267) gv3038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1202: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1201, gv3038, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1201) model_decoder_layers_16_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[888] gv3039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3039, R.dtype("float16")) _2267: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight4, reshape1202, model_decoder_layers_16_encoder_attn_out_proj_bias4, alloc2268) R.vm.kill_object(reshape1202) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias4) gv3040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3040, R.dtype("float16")) cls.add5(alloc2264, alloc2268, alloc2269) R.vm.kill_object(alloc2264) R.vm.kill_object(alloc2268) model_decoder_layers_16_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[896] gv3041: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3041, R.dtype("float16")) cls.layer_norm2(alloc2269, model_decoder_layers_16_final_layer_norm_weight4, model_decoder_layers_16_final_layer_norm_bias4, alloc2270) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias4) model_decoder_layers_16_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[892] gv3042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3042, R.dtype("float16")) _2270: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight4, alloc2270, model_decoder_layers_16_fc1_bias4, alloc2271) R.vm.kill_object(alloc2270) R.vm.kill_object(model_decoder_layers_16_fc1_weight4) R.vm.kill_object(model_decoder_layers_16_fc1_bias4) model_decoder_layers_16_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[894] gv3043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2272: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3043, 
R.dtype("float16")) _2271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight4, alloc2271, model_decoder_layers_16_fc2_bias4, alloc2272) R.vm.kill_object(alloc2271) R.vm.kill_object(model_decoder_layers_16_fc2_weight4) R.vm.kill_object(model_decoder_layers_16_fc2_bias4) gv3044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3044, R.dtype("float16")) cls.add5(alloc2269, alloc2272, alloc2273) R.vm.kill_object(alloc2269) R.vm.kill_object(alloc2272) model_decoder_layers_17_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[905] gv3045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3045, R.dtype("float16")) cls.layer_norm2(alloc2273, model_decoder_layers_17_self_attn_layer_norm_weight4, model_decoder_layers_17_self_attn_layer_norm_bias4, alloc2274) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias4) model_decoder_layers_17_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[901] gv3046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3046, R.dtype("float16")) _2274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_q_proj_bias4, alloc2275) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias4) gv3047: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1203: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2275, gv3047, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2275) model_decoder_layers_17_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv3048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2276: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3048, R.dtype("float16")) _2275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight4, alloc2274, alloc2276) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight4) gv3049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1204: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2276, gv3049, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2276) model_decoder_layers_17_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[899] gv3050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3050, R.dtype("float16")) _2276: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_v_proj_bias4, alloc2277) R.vm.kill_object(alloc2274) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias4) gv3051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1205: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2277, gv3051, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2277) gv3052: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2278: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3052, R.dtype("float16")) 
cls.concatenate1(reshape1203, reshape1204, reshape1205, alloc2278) R.vm.kill_object(reshape1203) R.vm.kill_object(reshape1204) R.vm.kill_object(reshape1205) gv3053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1206: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2278, gv3053, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2278) gv3054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3054, R.dtype("float16")) _2278: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1206, alloc2279) R.vm.kill_object(reshape1206) gv3055: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1207: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2279, gv3055, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2279) gv3056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1208: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1207, gv3056, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1207) model_decoder_layers_17_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[903] gv3057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3057, R.dtype("float16")) _2279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight4, reshape1208, model_decoder_layers_17_self_attn_out_proj_bias4, alloc2280) R.vm.kill_object(reshape1208) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias4) gv3058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2281: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3058, R.dtype("float16")) cls.add5(alloc2273, alloc2280, alloc2281) R.vm.kill_object(alloc2273) R.vm.kill_object(alloc2280) model_decoder_layers_17_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[914] gv3059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv3059, R.dtype("float16")) cls.layer_norm2(alloc2281, model_decoder_layers_17_encoder_attn_layer_norm_weight4, model_decoder_layers_17_encoder_attn_layer_norm_bias4, alloc2282) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias4) model_decoder_layers_17_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[910] gv3060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2283: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3060, R.dtype("float16")) _2282: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight4, alloc2282, model_decoder_layers_17_encoder_attn_q_proj_bias4, alloc2283) R.vm.kill_object(alloc2282) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias4) gv3061: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1209: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2283, gv3061, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2283) gv3062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1210: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1209, gv3062, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1209) gv3063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3063, R.dtype("float16")) _2283: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1210, alloc2284) R.vm.kill_object(reshape1210) gv3064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1211: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2284, gv3064, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2284) gv3065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1212: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1211, gv3065, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1211) model_decoder_layers_17_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[912] gv3066: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3066, R.dtype("float16")) _2284: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight4, reshape1212, model_decoder_layers_17_encoder_attn_out_proj_bias4, alloc2285) R.vm.kill_object(reshape1212) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias4) gv3067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3067, R.dtype("float16")) cls.add5(alloc2281, alloc2285, alloc2286) R.vm.kill_object(alloc2281) R.vm.kill_object(alloc2285) model_decoder_layers_17_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[920] gv3068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3068, R.dtype("float16")) cls.layer_norm2(alloc2286, model_decoder_layers_17_final_layer_norm_weight4, model_decoder_layers_17_final_layer_norm_bias4, alloc2287) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias4) model_decoder_layers_17_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[916] gv3069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3069, R.dtype("float16")) _2287: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight4, alloc2287, model_decoder_layers_17_fc1_bias4, alloc2288) R.vm.kill_object(alloc2287) R.vm.kill_object(model_decoder_layers_17_fc1_weight4) R.vm.kill_object(model_decoder_layers_17_fc1_bias4) model_decoder_layers_17_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[918] gv3070: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3070, R.dtype("float16")) _2288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_17_fc2_weight4, alloc2288, model_decoder_layers_17_fc2_bias4, alloc2289) R.vm.kill_object(alloc2288) R.vm.kill_object(model_decoder_layers_17_fc2_weight4) R.vm.kill_object(model_decoder_layers_17_fc2_bias4) gv3071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), 
gv3071, R.dtype("float16")) cls.add5(alloc2286, alloc2289, alloc2290) R.vm.kill_object(alloc2286) R.vm.kill_object(alloc2289) model_decoder_layers_18_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[929] gv3072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2291: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3072, R.dtype("float16")) cls.layer_norm2(alloc2290, model_decoder_layers_18_self_attn_layer_norm_weight4, model_decoder_layers_18_self_attn_layer_norm_bias4, alloc2291) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias4) model_decoder_layers_18_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[925] gv3073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3073, R.dtype("float16")) _2291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_q_proj_bias4, alloc2292) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias4) gv3074: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1213: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2292, gv3074, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2292) model_decoder_layers_18_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv3075: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3075, R.dtype("float16")) _2292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight4, alloc2291, alloc2293) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight4) gv3076: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1214: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2293, gv3076, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2293) model_decoder_layers_18_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[923] gv3077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2294: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3077, R.dtype("float16")) _2293: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_v_proj_bias4, alloc2294) R.vm.kill_object(alloc2291) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias4) gv3078: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1215: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2294, gv3078, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2294) gv3079: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2295: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3079, R.dtype("float16")) cls.concatenate1(reshape1213, reshape1214, reshape1215, alloc2295) R.vm.kill_object(reshape1213) R.vm.kill_object(reshape1214) R.vm.kill_object(reshape1215) gv3080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1216: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2295, gv3080, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2295) gv3081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3081, R.dtype("float16")) _2295: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape1216, alloc2296) R.vm.kill_object(reshape1216) gv3082: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1217: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2296, gv3082, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2296) gv3083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1218: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1217, gv3083, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1217) model_decoder_layers_18_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[927] gv3084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3084, R.dtype("float16")) _2296: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight4, reshape1218, model_decoder_layers_18_self_attn_out_proj_bias4, alloc2297) R.vm.kill_object(reshape1218) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias4) gv3085: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3085, R.dtype("float16")) cls.add5(alloc2290, alloc2297, alloc2298) R.vm.kill_object(alloc2290) R.vm.kill_object(alloc2297) model_decoder_layers_18_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[938] gv3086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2299: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3086, R.dtype("float16")) cls.layer_norm2(alloc2298, model_decoder_layers_18_encoder_attn_layer_norm_weight4, model_decoder_layers_18_encoder_attn_layer_norm_bias4, alloc2299) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias4) model_decoder_layers_18_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[934] gv3087: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2300: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3087, R.dtype("float16")) _2299: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight4, alloc2299, model_decoder_layers_18_encoder_attn_q_proj_bias4, alloc2300) R.vm.kill_object(alloc2299) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias4) gv3088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1219: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2300, gv3088, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2300) gv3089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1220: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1219, gv3089, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1219) gv3090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3090, R.dtype("float16")) _2300: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, 
R.prim_value(18), R.prim_value(T.float32(1)), reshape1220, alloc2301) R.vm.kill_object(reshape1220) gv3091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1221: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2301, gv3091, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2301) gv3092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1222: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1221, gv3092, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1221) model_decoder_layers_18_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[936] gv3093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3093, R.dtype("float16")) _2301: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight4, reshape1222, model_decoder_layers_18_encoder_attn_out_proj_bias4, alloc2302) R.vm.kill_object(reshape1222) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias4) gv3094: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3094, R.dtype("float16")) cls.add5(alloc2298, alloc2302, alloc2303) R.vm.kill_object(alloc2298) R.vm.kill_object(alloc2302) model_decoder_layers_18_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[944] gv3095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3095, R.dtype("float16")) cls.layer_norm2(alloc2303, model_decoder_layers_18_final_layer_norm_weight4, model_decoder_layers_18_final_layer_norm_bias4, alloc2304) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias4) model_decoder_layers_18_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[940] gv3096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3096, R.dtype("float16")) _2304: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight4, alloc2304, model_decoder_layers_18_fc1_bias4, 
alloc2305) R.vm.kill_object(alloc2304) R.vm.kill_object(model_decoder_layers_18_fc1_weight4) R.vm.kill_object(model_decoder_layers_18_fc1_bias4) model_decoder_layers_18_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[942] gv3097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3097, R.dtype("float16")) _2305: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight4, alloc2305, model_decoder_layers_18_fc2_bias4, alloc2306) R.vm.kill_object(alloc2305) R.vm.kill_object(model_decoder_layers_18_fc2_weight4) R.vm.kill_object(model_decoder_layers_18_fc2_bias4) gv3098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3098, R.dtype("float16")) cls.add5(alloc2303, alloc2306, alloc2307) R.vm.kill_object(alloc2303) R.vm.kill_object(alloc2306) model_decoder_layers_19_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[953] gv3099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), 
gv3099, R.dtype("float16")) cls.layer_norm2(alloc2307, model_decoder_layers_19_self_attn_layer_norm_weight4, model_decoder_layers_19_self_attn_layer_norm_bias4, alloc2308) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias4) model_decoder_layers_19_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[949] gv3100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3100, R.dtype("float16")) _2308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight4, alloc2308, model_decoder_layers_19_self_attn_q_proj_bias4, alloc2309) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias4) gv3101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1223: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2309, gv3101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2309) model_decoder_layers_19_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv3102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3102, R.dtype("float16")) _2309: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight4, alloc2308, alloc2310) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight4) gv3103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1224: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2310, gv3103, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2310) model_decoder_layers_19_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[947] gv3104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3104, R.dtype("float16")) _2310: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight4, alloc2308, model_decoder_layers_19_self_attn_v_proj_bias4, alloc2311) R.vm.kill_object(alloc2308) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias4) gv3105: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1225: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2311, gv3105, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2311) gv3106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2312: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3106, R.dtype("float16")) cls.concatenate1(reshape1223, reshape1224, reshape1225, alloc2312) R.vm.kill_object(reshape1223) R.vm.kill_object(reshape1224) R.vm.kill_object(reshape1225) gv3107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1226: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2312, gv3107, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2312) gv3108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3108, R.dtype("float16")) _2312: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1226, alloc2313) R.vm.kill_object(reshape1226) gv3109: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1227: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2313, gv3109, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2313) gv3110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1228: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1227, gv3110, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1227) model_decoder_layers_19_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[951] gv3111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3111, R.dtype("float16")) _2313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight4, reshape1228, model_decoder_layers_19_self_attn_out_proj_bias4, alloc2314) R.vm.kill_object(reshape1228) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias4) gv3112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv3112, R.dtype("float16")) cls.add5(alloc2307, alloc2314, alloc2315) R.vm.kill_object(alloc2307) R.vm.kill_object(alloc2314) model_decoder_layers_19_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[962] gv3113: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3113, R.dtype("float16")) cls.layer_norm2(alloc2315, model_decoder_layers_19_encoder_attn_layer_norm_weight4, model_decoder_layers_19_encoder_attn_layer_norm_bias4, alloc2316) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias4) model_decoder_layers_19_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[958] gv3114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2317: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3114, R.dtype("float16")) _2316: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight4, alloc2316, model_decoder_layers_19_encoder_attn_q_proj_bias4, alloc2317) R.vm.kill_object(alloc2316) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias4) gv3115: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1229: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2317, gv3115, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2317) gv3116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1230: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1229, gv3116, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1229) gv3117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3117, R.dtype("float16")) _2317: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1230, alloc2318) R.vm.kill_object(reshape1230) gv3118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1231: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2318, gv3118, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2318) gv3119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1232: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1231, gv3119, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1231) model_decoder_layers_19_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[960] gv3120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3120, R.dtype("float16")) _2318: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight4, reshape1232, model_decoder_layers_19_encoder_attn_out_proj_bias4, alloc2319) R.vm.kill_object(reshape1232) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias4) gv3121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3121, R.dtype("float16")) cls.add5(alloc2315, alloc2319, alloc2320) R.vm.kill_object(alloc2315) R.vm.kill_object(alloc2319) model_decoder_layers_19_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[968] gv3122: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2321: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3122, R.dtype("float16")) cls.layer_norm2(alloc2320, model_decoder_layers_19_final_layer_norm_weight4, model_decoder_layers_19_final_layer_norm_bias4, alloc2321) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias4) model_decoder_layers_19_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[964] gv3123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3123, R.dtype("float16")) _2321: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight4, alloc2321, model_decoder_layers_19_fc1_bias4, alloc2322) R.vm.kill_object(alloc2321) R.vm.kill_object(model_decoder_layers_19_fc1_weight4) R.vm.kill_object(model_decoder_layers_19_fc1_bias4) model_decoder_layers_19_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[966] gv3124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3124, 
R.dtype("float16")) _2322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight4, alloc2322, model_decoder_layers_19_fc2_bias4, alloc2323) R.vm.kill_object(alloc2322) R.vm.kill_object(model_decoder_layers_19_fc2_weight4) R.vm.kill_object(model_decoder_layers_19_fc2_bias4) gv3125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3125, R.dtype("float16")) cls.add5(alloc2320, alloc2323, alloc2324) R.vm.kill_object(alloc2320) R.vm.kill_object(alloc2323) model_decoder_layers_20_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[977] gv3126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3126, R.dtype("float16")) cls.layer_norm2(alloc2324, model_decoder_layers_20_self_attn_layer_norm_weight4, model_decoder_layers_20_self_attn_layer_norm_bias4, alloc2325) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias4) model_decoder_layers_20_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[973] gv3127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3127, R.dtype("float16")) _2325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_q_proj_bias4, alloc2326) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias4) gv3128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1233: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2326, gv3128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2326) model_decoder_layers_20_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv3129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3129, R.dtype("float16")) _2326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight4, alloc2325, alloc2327) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight4) gv3130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1234: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2327, gv3130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2327) model_decoder_layers_20_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[971] gv3131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2328: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3131, R.dtype("float16")) _2327: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_v_proj_bias4, alloc2328) R.vm.kill_object(alloc2325) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias4) gv3132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1235: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2328, gv3132, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2328) gv3133: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2329: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3133, R.dtype("float16")) 
cls.concatenate1(reshape1233, reshape1234, reshape1235, alloc2329) R.vm.kill_object(reshape1233) R.vm.kill_object(reshape1234) R.vm.kill_object(reshape1235) gv3134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1236: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2329, gv3134, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2329) gv3135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3135, R.dtype("float16")) _2329: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1236, alloc2330) R.vm.kill_object(reshape1236) gv3136: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1237: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2330, gv3136, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2330) gv3137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1238: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1237, gv3137, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1237) model_decoder_layers_20_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[975] gv3138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3138, R.dtype("float16")) _2330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight4, reshape1238, model_decoder_layers_20_self_attn_out_proj_bias4, alloc2331) R.vm.kill_object(reshape1238) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias4) gv3139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3139, R.dtype("float16")) cls.add5(alloc2324, alloc2331, alloc2332) R.vm.kill_object(alloc2324) R.vm.kill_object(alloc2331) model_decoder_layers_20_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[986] gv3140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv3140, R.dtype("float16")) cls.layer_norm2(alloc2332, model_decoder_layers_20_encoder_attn_layer_norm_weight4, model_decoder_layers_20_encoder_attn_layer_norm_bias4, alloc2333) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias4) model_decoder_layers_20_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[982] gv3141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2334: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3141, R.dtype("float16")) _2333: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight4, alloc2333, model_decoder_layers_20_encoder_attn_q_proj_bias4, alloc2334) R.vm.kill_object(alloc2333) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias4) gv3142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1239: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2334, gv3142, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2334) gv3143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1240: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1239, gv3143, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1239) gv3144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3144, R.dtype("float16")) _2334: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1240, alloc2335) R.vm.kill_object(reshape1240) gv3145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1241: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2335, gv3145, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2335) gv3146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1242: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1241, gv3146, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1241) model_decoder_layers_20_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[984] gv3147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3147, R.dtype("float16")) _2335: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight4, reshape1242, model_decoder_layers_20_encoder_attn_out_proj_bias4, alloc2336) R.vm.kill_object(reshape1242) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias4) gv3148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3148, R.dtype("float16")) cls.add5(alloc2332, alloc2336, alloc2337) R.vm.kill_object(alloc2332) R.vm.kill_object(alloc2336) model_decoder_layers_20_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[992] gv3149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3149, R.dtype("float16")) cls.layer_norm2(alloc2337, model_decoder_layers_20_final_layer_norm_weight4, model_decoder_layers_20_final_layer_norm_bias4, alloc2338) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias4) model_decoder_layers_20_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[988] gv3150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3150, R.dtype("float16")) _2338: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight4, alloc2338, model_decoder_layers_20_fc1_bias4, alloc2339) R.vm.kill_object(alloc2338) R.vm.kill_object(model_decoder_layers_20_fc1_weight4) R.vm.kill_object(model_decoder_layers_20_fc1_bias4) model_decoder_layers_20_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[990] gv3151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2340: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3151, R.dtype("float16")) _2339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight4, alloc2339, model_decoder_layers_20_fc2_bias4, alloc2340) R.vm.kill_object(alloc2339) R.vm.kill_object(model_decoder_layers_20_fc2_weight4) R.vm.kill_object(model_decoder_layers_20_fc2_bias4) gv3152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv3152, R.dtype("float16")) cls.add5(alloc2337, alloc2340, alloc2341) R.vm.kill_object(alloc2337) R.vm.kill_object(alloc2340) model_decoder_layers_21_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv3153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3153, R.dtype("float16")) cls.layer_norm2(alloc2341, model_decoder_layers_21_self_attn_layer_norm_weight4, model_decoder_layers_21_self_attn_layer_norm_bias4, alloc2342) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias4) model_decoder_layers_21_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[997] gv3154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3154, R.dtype("float16")) _2342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_q_proj_bias4, alloc2343) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias4) gv3155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1243: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2343, gv3155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2343) model_decoder_layers_21_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv3156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2344: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3156, R.dtype("float16")) _2343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight4, alloc2342, alloc2344) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight4) gv3157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1244: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2344, gv3157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2344) model_decoder_layers_21_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[995] gv3158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2345: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3158, R.dtype("float16")) _2344: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_v_proj_bias4, alloc2345) R.vm.kill_object(alloc2342) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias4) gv3159: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1245: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2345, gv3159, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2345) gv3160: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2346: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3160, R.dtype("float16")) cls.concatenate1(reshape1243, reshape1244, reshape1245, alloc2346) R.vm.kill_object(reshape1243) R.vm.kill_object(reshape1244) R.vm.kill_object(reshape1245) gv3161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1246: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2346, gv3161, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2346) gv3162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3162, R.dtype("float16")) _2346: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1246, alloc2347) R.vm.kill_object(reshape1246) gv3163: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1247: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2347, gv3163, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2347) gv3164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1248: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1247, gv3164, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1247) model_decoder_layers_21_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[999] gv3165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3165, R.dtype("float16")) _2347: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight4, reshape1248, model_decoder_layers_21_self_attn_out_proj_bias4, alloc2348) R.vm.kill_object(reshape1248) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias4) gv3166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3166, R.dtype("float16")) cls.add5(alloc2341, alloc2348, alloc2349) R.vm.kill_object(alloc2341) R.vm.kill_object(alloc2348) model_decoder_layers_21_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv3167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2350: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3167, R.dtype("float16")) cls.layer_norm2(alloc2349, model_decoder_layers_21_encoder_attn_layer_norm_weight4, model_decoder_layers_21_encoder_attn_layer_norm_bias4, alloc2350) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias4) model_decoder_layers_21_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv3168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2351: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3168, R.dtype("float16")) _2350: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight4, alloc2350, model_decoder_layers_21_encoder_attn_q_proj_bias4, alloc2351) R.vm.kill_object(alloc2350) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias4) gv3169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1249: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2351, gv3169, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2351) gv3170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1250: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1249, gv3170, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1249) gv3171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3171, R.dtype("float16")) _2351: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1250, alloc2352) R.vm.kill_object(reshape1250) gv3172: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1251: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2352, gv3172, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2352) gv3173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1252: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1251, gv3173, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1251) model_decoder_layers_21_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv3174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3174, R.dtype("float16")) _2352: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight4, reshape1252, model_decoder_layers_21_encoder_attn_out_proj_bias4, alloc2353) R.vm.kill_object(reshape1252) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias4) 
gv3175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3175, R.dtype("float16")) cls.add5(alloc2349, alloc2353, alloc2354) R.vm.kill_object(alloc2349) R.vm.kill_object(alloc2353) model_decoder_layers_21_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv3176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3176, R.dtype("float16")) cls.layer_norm2(alloc2354, model_decoder_layers_21_final_layer_norm_weight4, model_decoder_layers_21_final_layer_norm_bias4, alloc2355) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias4) model_decoder_layers_21_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv3177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3177, R.dtype("float16")) _2355: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight4, alloc2355, 
model_decoder_layers_21_fc1_bias4, alloc2356) R.vm.kill_object(alloc2355) R.vm.kill_object(model_decoder_layers_21_fc1_weight4) R.vm.kill_object(model_decoder_layers_21_fc1_bias4) model_decoder_layers_21_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv3178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3178, R.dtype("float16")) _2356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight4, alloc2356, model_decoder_layers_21_fc2_bias4, alloc2357) R.vm.kill_object(alloc2356) R.vm.kill_object(model_decoder_layers_21_fc2_weight4) R.vm.kill_object(model_decoder_layers_21_fc2_bias4) gv3179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3179, R.dtype("float16")) cls.add5(alloc2354, alloc2357, alloc2358) R.vm.kill_object(alloc2354) R.vm.kill_object(alloc2357) model_decoder_layers_22_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv3180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2359: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage41, R.prim_value(0), gv3180, R.dtype("float16")) cls.layer_norm2(alloc2358, model_decoder_layers_22_self_attn_layer_norm_weight4, model_decoder_layers_22_self_attn_layer_norm_bias4, alloc2359) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias4) model_decoder_layers_22_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv3181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3181, R.dtype("float16")) _2359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_q_proj_bias4, alloc2360) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias4) gv3182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1253: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2360, gv3182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2360) model_decoder_layers_22_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv3183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3183, R.dtype("float16")) _2360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight4, alloc2359, alloc2361) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight4) gv3184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1254: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2361, gv3184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2361) model_decoder_layers_22_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv3185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3185, R.dtype("float16")) _2361: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_v_proj_bias4, alloc2362) R.vm.kill_object(alloc2359) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias4) gv3186: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1255: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2362, gv3186, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2362) gv3187: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2363: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3187, R.dtype("float16")) cls.concatenate1(reshape1253, reshape1254, reshape1255, alloc2363) R.vm.kill_object(reshape1253) R.vm.kill_object(reshape1254) R.vm.kill_object(reshape1255) gv3188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1256: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2363, gv3188, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2363) gv3189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3189, R.dtype("float16")) _2363: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1256, alloc2364) R.vm.kill_object(reshape1256) gv3190: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1257: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2364, gv3190, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2364) gv3191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1258: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1257, gv3191, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1257) model_decoder_layers_22_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv3192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3192, R.dtype("float16")) _2364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight4, reshape1258, model_decoder_layers_22_self_attn_out_proj_bias4, alloc2365) R.vm.kill_object(reshape1258) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias4) gv3193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2366: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage41, R.prim_value(0), gv3193, R.dtype("float16")) cls.add5(alloc2358, alloc2365, alloc2366) R.vm.kill_object(alloc2358) R.vm.kill_object(alloc2365) model_decoder_layers_22_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv3194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3194, R.dtype("float16")) cls.layer_norm2(alloc2366, model_decoder_layers_22_encoder_attn_layer_norm_weight4, model_decoder_layers_22_encoder_attn_layer_norm_bias4, alloc2367) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias4) model_decoder_layers_22_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv3195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2368: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3195, R.dtype("float16")) _2367: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight4, alloc2367, model_decoder_layers_22_encoder_attn_q_proj_bias4, alloc2368) R.vm.kill_object(alloc2367) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight4) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias4) gv3196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1259: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2368, gv3196, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2368) gv3197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1260: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1259, gv3197, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1259) gv3198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3198, R.dtype("float16")) _2368: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1260, alloc2369) R.vm.kill_object(reshape1260) gv3199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1261: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2369, gv3199, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2369) 
gv3200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1262: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1261, gv3200, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1261) model_decoder_layers_22_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv3201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3201, R.dtype("float16")) _2369: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight4, reshape1262, model_decoder_layers_22_encoder_attn_out_proj_bias4, alloc2370) R.vm.kill_object(reshape1262) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias4) gv3202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3202, R.dtype("float16")) cls.add5(alloc2366, alloc2370, alloc2371) R.vm.kill_object(alloc2366) R.vm.kill_object(alloc2370) model_decoder_layers_22_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1039] 
model_decoder_layers_22_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv3203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3203, R.dtype("float16")) cls.layer_norm2(alloc2371, model_decoder_layers_22_final_layer_norm_weight4, model_decoder_layers_22_final_layer_norm_bias4, alloc2372) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias4) model_decoder_layers_22_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv3204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3204, R.dtype("float16")) _2372: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight4, alloc2372, model_decoder_layers_22_fc1_bias4, alloc2373) R.vm.kill_object(alloc2372) R.vm.kill_object(model_decoder_layers_22_fc1_weight4) R.vm.kill_object(model_decoder_layers_22_fc1_bias4) model_decoder_layers_22_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv3205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3205, R.dtype("float16")) _2373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight4, alloc2373, model_decoder_layers_22_fc2_bias4, alloc2374) R.vm.kill_object(alloc2373) R.vm.kill_object(model_decoder_layers_22_fc2_weight4) R.vm.kill_object(model_decoder_layers_22_fc2_bias4) gv3206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3206, R.dtype("float16")) cls.add5(alloc2371, alloc2374, alloc2375) R.vm.kill_object(alloc2371) R.vm.kill_object(alloc2374) model_decoder_layers_23_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1049] gv3207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3207, R.dtype("float16")) cls.layer_norm2(alloc2375, model_decoder_layers_23_self_attn_layer_norm_weight4, model_decoder_layers_23_self_attn_layer_norm_bias4, alloc2376) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias4) model_decoder_layers_23_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv3208: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3208, R.dtype("float16")) _2376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_q_proj_bias4, alloc2377) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias4) gv3209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1263: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2377, gv3209, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2377) model_decoder_layers_23_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv3210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3210, R.dtype("float16")) _2377: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight4, alloc2376, alloc2378) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight4) gv3211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1264: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2378, gv3211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2378) model_decoder_layers_23_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv3212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3212, R.dtype("float16")) _2378: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_v_proj_bias4, alloc2379) R.vm.kill_object(alloc2376) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias4) gv3213: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1265: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2379, gv3213, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2379) gv3214: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc2380: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3214, R.dtype("float16")) cls.concatenate1(reshape1263, reshape1264, reshape1265, alloc2380) R.vm.kill_object(reshape1263) R.vm.kill_object(reshape1264) R.vm.kill_object(reshape1265) gv3215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1266: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2380, gv3215, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2380) gv3216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3216, R.dtype("float16")) _2380: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1266, alloc2381) R.vm.kill_object(reshape1266) gv3217: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1267: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2381, gv3217, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2381) gv3218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape1268: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1267, gv3218, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1267) model_decoder_layers_23_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv3219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3219, R.dtype("float16")) _2381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight4, reshape1268, model_decoder_layers_23_self_attn_out_proj_bias4, alloc2382) R.vm.kill_object(reshape1268) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias4) gv3220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3220, R.dtype("float16")) cls.add5(alloc2375, alloc2382, alloc2383) R.vm.kill_object(alloc2375) R.vm.kill_object(alloc2382) model_decoder_layers_23_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv3221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3221, R.dtype("float16")) cls.layer_norm2(alloc2383, model_decoder_layers_23_encoder_attn_layer_norm_weight4, model_decoder_layers_23_encoder_attn_layer_norm_bias4, alloc2384) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias4) model_decoder_layers_23_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv3222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2385: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3222, R.dtype("float16")) _2384: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight4, alloc2384, model_decoder_layers_23_encoder_attn_q_proj_bias4, alloc2385) R.vm.kill_object(alloc2384) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias4) gv3223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1269: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2385, gv3223, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2385) gv3224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1270: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1269, gv3224, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1269) gv3225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3225, R.dtype("float16")) _2385: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1270, alloc2386) R.vm.kill_object(reshape1270) gv3226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1271: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2386, gv3226, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2386) gv3227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1272: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1271, gv3227, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1271) model_decoder_layers_23_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias4: 
R.Tensor((1280,), dtype="float16") = packed_params[1056] gv3228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3228, R.dtype("float16")) _2386: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight4, reshape1272, model_decoder_layers_23_encoder_attn_out_proj_bias4, alloc2387) R.vm.kill_object(reshape1272) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias4) gv3229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3229, R.dtype("float16")) cls.add5(alloc2383, alloc2387, alloc2388) R.vm.kill_object(alloc2383) R.vm.kill_object(alloc2387) model_decoder_layers_23_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv3230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2389: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3230, R.dtype("float16")) cls.layer_norm2(alloc2388, model_decoder_layers_23_final_layer_norm_weight4, model_decoder_layers_23_final_layer_norm_bias4, alloc2389) 
R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias4) model_decoder_layers_23_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv3231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3231, R.dtype("float16")) _2389: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight4, alloc2389, model_decoder_layers_23_fc1_bias4, alloc2390) R.vm.kill_object(alloc2389) R.vm.kill_object(model_decoder_layers_23_fc1_weight4) R.vm.kill_object(model_decoder_layers_23_fc1_bias4) model_decoder_layers_23_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv3232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3232, R.dtype("float16")) _2390: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_23_fc2_weight4, alloc2390, model_decoder_layers_23_fc2_bias4, alloc2391) R.vm.kill_object(alloc2390) R.vm.kill_object(model_decoder_layers_23_fc2_weight4) R.vm.kill_object(model_decoder_layers_23_fc2_bias4) gv3233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3233, R.dtype("float16")) cls.add5(alloc2388, alloc2391, alloc2392) R.vm.kill_object(alloc2388) R.vm.kill_object(alloc2391) model_decoder_layers_24_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv3234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3234, R.dtype("float16")) cls.layer_norm2(alloc2392, model_decoder_layers_24_self_attn_layer_norm_weight4, model_decoder_layers_24_self_attn_layer_norm_bias4, alloc2393) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias4) model_decoder_layers_24_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv3235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3235, R.dtype("float16")) _2393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_q_proj_bias4, alloc2394) 
R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias4) gv3236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1273: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2394, gv3236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2394) model_decoder_layers_24_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv3237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3237, R.dtype("float16")) _2394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight4, alloc2393, alloc2395) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight4) gv3238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1274: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2395, gv3238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2395) model_decoder_layers_24_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv3239: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3239, R.dtype("float16")) _2395: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_v_proj_bias4, alloc2396) R.vm.kill_object(alloc2393) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias4) gv3240: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1275: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2396, gv3240, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2396) gv3241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2397: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3241, R.dtype("float16")) cls.concatenate1(reshape1273, reshape1274, reshape1275, alloc2397) R.vm.kill_object(reshape1273) R.vm.kill_object(reshape1274) R.vm.kill_object(reshape1275) gv3242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1276: R.Tensor((seq_len, 60, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2397, gv3242, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2397) gv3243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3243, R.dtype("float16")) _2397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1276, alloc2398) R.vm.kill_object(reshape1276) gv3244: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1277: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2398, gv3244, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2398) gv3245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1278: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1277, gv3245, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1277) model_decoder_layers_24_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv3246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3246, R.dtype("float16")) _2398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight4, reshape1278, model_decoder_layers_24_self_attn_out_proj_bias4, alloc2399) R.vm.kill_object(reshape1278) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias4) gv3247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3247, R.dtype("float16")) cls.add5(alloc2392, alloc2399, alloc2400) R.vm.kill_object(alloc2392) R.vm.kill_object(alloc2399) model_decoder_layers_24_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv3248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2401: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3248, R.dtype("float16")) cls.layer_norm2(alloc2400, model_decoder_layers_24_encoder_attn_layer_norm_weight4, model_decoder_layers_24_encoder_attn_layer_norm_bias4, alloc2401) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias4) model_decoder_layers_24_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv3249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2402: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3249, R.dtype("float16")) _2401: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight4, alloc2401, model_decoder_layers_24_encoder_attn_q_proj_bias4, alloc2402) R.vm.kill_object(alloc2401) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias4) gv3250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1279: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2402, gv3250, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2402) gv3251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1280: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1279, gv3251, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1279) gv3252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
alloc2403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3252, R.dtype("float16")) _2402: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1280, alloc2403) R.vm.kill_object(reshape1280) gv3253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1281: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2403, gv3253, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2403) gv3254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1282: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1281, gv3254, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1281) model_decoder_layers_24_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1080] gv3255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3255, R.dtype("float16")) _2403: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight4, reshape1282, 
model_decoder_layers_24_encoder_attn_out_proj_bias4, alloc2404) R.vm.kill_object(reshape1282) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias4) gv3256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3256, R.dtype("float16")) cls.add5(alloc2400, alloc2404, alloc2405) R.vm.kill_object(alloc2400) R.vm.kill_object(alloc2404) model_decoder_layers_24_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv3257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3257, R.dtype("float16")) cls.layer_norm2(alloc2405, model_decoder_layers_24_final_layer_norm_weight4, model_decoder_layers_24_final_layer_norm_bias4, alloc2406) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias4) model_decoder_layers_24_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv3258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2407: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage37, R.prim_value(0), gv3258, R.dtype("float16")) _2406: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight4, alloc2406, model_decoder_layers_24_fc1_bias4, alloc2407) R.vm.kill_object(alloc2406) R.vm.kill_object(model_decoder_layers_24_fc1_weight4) R.vm.kill_object(model_decoder_layers_24_fc1_bias4) model_decoder_layers_24_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv3259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2408: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3259, R.dtype("float16")) _2407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight4, alloc2407, model_decoder_layers_24_fc2_bias4, alloc2408) R.vm.kill_object(alloc2407) R.vm.kill_object(model_decoder_layers_24_fc2_weight4) R.vm.kill_object(model_decoder_layers_24_fc2_bias4) gv3260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3260, R.dtype("float16")) cls.add5(alloc2405, alloc2408, alloc2409) R.vm.kill_object(alloc2405) R.vm.kill_object(alloc2408) model_decoder_layers_25_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv3261: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3261, R.dtype("float16")) cls.layer_norm2(alloc2409, model_decoder_layers_25_self_attn_layer_norm_weight4, model_decoder_layers_25_self_attn_layer_norm_bias4, alloc2410) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias4) model_decoder_layers_25_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv3262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3262, R.dtype("float16")) _2410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight4, alloc2410, model_decoder_layers_25_self_attn_q_proj_bias4, alloc2411) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias4) gv3263: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1283: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2411, gv3263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2411) 
model_decoder_layers_25_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv3264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2412: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3264, R.dtype("float16")) _2411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight4, alloc2410, alloc2412) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight4) gv3265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1284: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2412, gv3265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2412) model_decoder_layers_25_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv3266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3266, R.dtype("float16")) _2412: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight4, alloc2410, model_decoder_layers_25_self_attn_v_proj_bias4, alloc2413) R.vm.kill_object(alloc2410) 
R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias4) gv3267: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1285: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2413, gv3267, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2413) gv3268: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2414: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3268, R.dtype("float16")) cls.concatenate1(reshape1283, reshape1284, reshape1285, alloc2414) R.vm.kill_object(reshape1283) R.vm.kill_object(reshape1284) R.vm.kill_object(reshape1285) gv3269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1286: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2414, gv3269, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2414) gv3270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3270, R.dtype("float16")) _2414: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1286, alloc2415) R.vm.kill_object(reshape1286) gv3271: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1287: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2415, gv3271, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2415) gv3272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1288: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1287, gv3272, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1287) model_decoder_layers_25_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv3273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3273, R.dtype("float16")) _2415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight4, reshape1288, model_decoder_layers_25_self_attn_out_proj_bias4, alloc2416) R.vm.kill_object(reshape1288) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias4) gv3274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3274, R.dtype("float16")) cls.add5(alloc2409, alloc2416, alloc2417) R.vm.kill_object(alloc2409) R.vm.kill_object(alloc2416) model_decoder_layers_25_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv3275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3275, R.dtype("float16")) cls.layer_norm2(alloc2417, model_decoder_layers_25_encoder_attn_layer_norm_weight4, model_decoder_layers_25_encoder_attn_layer_norm_bias4, alloc2418) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias4) model_decoder_layers_25_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv3276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2419: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3276, R.dtype("float16")) _2418: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight4, alloc2418, model_decoder_layers_25_encoder_attn_q_proj_bias4, alloc2419) R.vm.kill_object(alloc2418) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias4) gv3277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1289: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2419, gv3277, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2419) gv3278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1290: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1289, gv3278, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1289) gv3279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3279, R.dtype("float16")) _2419: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1290, alloc2420) R.vm.kill_object(reshape1290) gv3280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1291: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2420, gv3280, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2420) gv3281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1292: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1291, gv3281, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1291) model_decoder_layers_25_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv3282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3282, R.dtype("float16")) _2420: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight4, reshape1292, model_decoder_layers_25_encoder_attn_out_proj_bias4, alloc2421) R.vm.kill_object(reshape1292) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias4) gv3283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2422: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv3283, R.dtype("float16")) cls.add5(alloc2417, alloc2421, alloc2422) R.vm.kill_object(alloc2417) R.vm.kill_object(alloc2421) model_decoder_layers_25_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv3284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3284, R.dtype("float16")) cls.layer_norm2(alloc2422, model_decoder_layers_25_final_layer_norm_weight4, model_decoder_layers_25_final_layer_norm_bias4, alloc2423) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias4) model_decoder_layers_25_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv3285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3285, R.dtype("float16")) _2423: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight4, alloc2423, model_decoder_layers_25_fc1_bias4, alloc2424) R.vm.kill_object(alloc2423) R.vm.kill_object(model_decoder_layers_25_fc1_weight4) R.vm.kill_object(model_decoder_layers_25_fc1_bias4) model_decoder_layers_25_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] 
model_decoder_layers_25_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv3286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3286, R.dtype("float16")) _2424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight4, alloc2424, model_decoder_layers_25_fc2_bias4, alloc2425) R.vm.kill_object(alloc2424) R.vm.kill_object(model_decoder_layers_25_fc2_weight4) R.vm.kill_object(model_decoder_layers_25_fc2_bias4) gv3287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3287, R.dtype("float16")) cls.add5(alloc2422, alloc2425, alloc2426) R.vm.kill_object(alloc2422) R.vm.kill_object(alloc2425) model_decoder_layers_26_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv3288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2427: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3288, R.dtype("float16")) cls.layer_norm2(alloc2426, model_decoder_layers_26_self_attn_layer_norm_weight4, model_decoder_layers_26_self_attn_layer_norm_bias4, alloc2427) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias4) model_decoder_layers_26_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv3289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3289, R.dtype("float16")) _2427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_q_proj_bias4, alloc2428) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias4) gv3290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1293: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2428, gv3290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2428) model_decoder_layers_26_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv3291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3291, R.dtype("float16")) _2428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", 
model_decoder_layers_26_self_attn_k_proj_weight4, alloc2427, alloc2429) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight4) gv3292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1294: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2429, gv3292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2429) model_decoder_layers_26_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv3293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2430: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3293, R.dtype("float16")) _2429: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_v_proj_bias4, alloc2430) R.vm.kill_object(alloc2427) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias4) gv3294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1295: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2430, gv3294, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2430) gv3295: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2431: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3295, R.dtype("float16")) cls.concatenate1(reshape1293, reshape1294, reshape1295, alloc2431) R.vm.kill_object(reshape1293) R.vm.kill_object(reshape1294) R.vm.kill_object(reshape1295) gv3296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1296: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2431, gv3296, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2431) gv3297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3297, R.dtype("float16")) _2431: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1296, alloc2432) R.vm.kill_object(reshape1296) gv3298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1297: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2432, gv3298, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2432) gv3299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1298: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1297, gv3299, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1297) model_decoder_layers_26_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv3300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3300, R.dtype("float16")) _2432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight4, reshape1298, model_decoder_layers_26_self_attn_out_proj_bias4, alloc2433) R.vm.kill_object(reshape1298) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias4) gv3301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3301, R.dtype("float16")) cls.add5(alloc2426, alloc2433, alloc2434) R.vm.kill_object(alloc2426) R.vm.kill_object(alloc2433) model_decoder_layers_26_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv3302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3302, R.dtype("float16")) cls.layer_norm2(alloc2434, model_decoder_layers_26_encoder_attn_layer_norm_weight4, model_decoder_layers_26_encoder_attn_layer_norm_bias4, alloc2435) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias4) model_decoder_layers_26_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv3303: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2436: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3303, R.dtype("float16")) _2435: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight4, alloc2435, model_decoder_layers_26_encoder_attn_q_proj_bias4, alloc2436) R.vm.kill_object(alloc2435) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias4) gv3304: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape1299: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2436, gv3304, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2436) gv3305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1300: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1299, gv3305, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1299) gv3306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3306, R.dtype("float16")) _2436: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1300, alloc2437) R.vm.kill_object(reshape1300) gv3307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1301: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2437, gv3307, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2437) gv3308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1302: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape1301, gv3308, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1301) model_decoder_layers_26_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv3309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3309, R.dtype("float16")) _2437: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight4, reshape1302, model_decoder_layers_26_encoder_attn_out_proj_bias4, alloc2438) R.vm.kill_object(reshape1302) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias4) gv3310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3310, R.dtype("float16")) cls.add5(alloc2434, alloc2438, alloc2439) R.vm.kill_object(alloc2434) R.vm.kill_object(alloc2438) model_decoder_layers_26_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv3311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2440: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3311, R.dtype("float16")) cls.layer_norm2(alloc2439, model_decoder_layers_26_final_layer_norm_weight4, model_decoder_layers_26_final_layer_norm_bias4, alloc2440) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias4) model_decoder_layers_26_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv3312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3312, R.dtype("float16")) _2440: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight4, alloc2440, model_decoder_layers_26_fc1_bias4, alloc2441) R.vm.kill_object(alloc2440) R.vm.kill_object(model_decoder_layers_26_fc1_weight4) R.vm.kill_object(model_decoder_layers_26_fc1_bias4) model_decoder_layers_26_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv3313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3313, R.dtype("float16")) _2441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight4, alloc2441, model_decoder_layers_26_fc2_bias4, alloc2442) 
R.vm.kill_object(alloc2441) R.vm.kill_object(model_decoder_layers_26_fc2_weight4) R.vm.kill_object(model_decoder_layers_26_fc2_bias4) gv3314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3314, R.dtype("float16")) cls.add5(alloc2439, alloc2442, alloc2443) R.vm.kill_object(alloc2439) R.vm.kill_object(alloc2442) model_decoder_layers_27_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv3315: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3315, R.dtype("float16")) cls.layer_norm2(alloc2443, model_decoder_layers_27_self_attn_layer_norm_weight4, model_decoder_layers_27_self_attn_layer_norm_bias4, alloc2444) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias4) model_decoder_layers_27_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv3316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3316, 
R.dtype("float16")) _2444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_q_proj_bias4, alloc2445) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias4) gv3317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1303: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2445, gv3317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2445) model_decoder_layers_27_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv3318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3318, R.dtype("float16")) _2445: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight4, alloc2444, alloc2446) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight4) gv3319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1304: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2446, gv3319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2446) 
model_decoder_layers_27_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv3320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3320, R.dtype("float16")) _2446: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_v_proj_bias4, alloc2447) R.vm.kill_object(alloc2444) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias4) gv3321: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1305: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2447, gv3321, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2447) gv3322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2448: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3322, R.dtype("float16")) cls.concatenate1(reshape1303, reshape1304, reshape1305, alloc2448) R.vm.kill_object(reshape1303) R.vm.kill_object(reshape1304) R.vm.kill_object(reshape1305) gv3323: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1306: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2448, gv3323, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2448) gv3324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3324, R.dtype("float16")) _2448: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1306, alloc2449) R.vm.kill_object(reshape1306) gv3325: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1307: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2449, gv3325, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2449) gv3326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1308: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1307, gv3326, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1307) model_decoder_layers_27_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] 
model_decoder_layers_27_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv3327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3327, R.dtype("float16")) _2449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight4, reshape1308, model_decoder_layers_27_self_attn_out_proj_bias4, alloc2450) R.vm.kill_object(reshape1308) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias4) gv3328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2451: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3328, R.dtype("float16")) cls.add5(alloc2443, alloc2450, alloc2451) R.vm.kill_object(alloc2443) R.vm.kill_object(alloc2450) model_decoder_layers_27_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv3329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3329, R.dtype("float16")) cls.layer_norm2(alloc2451, model_decoder_layers_27_encoder_attn_layer_norm_weight4, 
model_decoder_layers_27_encoder_attn_layer_norm_bias4, alloc2452) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias4) model_decoder_layers_27_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv3330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2453: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3330, R.dtype("float16")) _2452: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight4, alloc2452, model_decoder_layers_27_encoder_attn_q_proj_bias4, alloc2453) R.vm.kill_object(alloc2452) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias4) gv3331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1309: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2453, gv3331, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2453) gv3332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1310: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1309, gv3332, 
sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1309) gv3333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3333, R.dtype("float16")) _2453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1310, alloc2454) R.vm.kill_object(reshape1310) gv3334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1311: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2454, gv3334, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2454) gv3335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1312: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1311, gv3335, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1311) model_decoder_layers_27_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv3336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3336, R.dtype("float16")) _2454: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight4, reshape1312, model_decoder_layers_27_encoder_attn_out_proj_bias4, alloc2455) R.vm.kill_object(reshape1312) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias4) gv3337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3337, R.dtype("float16")) cls.add5(alloc2451, alloc2455, alloc2456) R.vm.kill_object(alloc2451) R.vm.kill_object(alloc2455) model_decoder_layers_27_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv3338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2457: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3338, R.dtype("float16")) cls.layer_norm2(alloc2456, model_decoder_layers_27_final_layer_norm_weight4, model_decoder_layers_27_final_layer_norm_bias4, alloc2457) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias4) model_decoder_layers_27_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias4: R.Tensor((5120,), dtype="float16") = 
packed_params[1156] gv3339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3339, R.dtype("float16")) _2457: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight4, alloc2457, model_decoder_layers_27_fc1_bias4, alloc2458) R.vm.kill_object(alloc2457) R.vm.kill_object(model_decoder_layers_27_fc1_weight4) R.vm.kill_object(model_decoder_layers_27_fc1_bias4) model_decoder_layers_27_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv3340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3340, R.dtype("float16")) _2458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight4, alloc2458, model_decoder_layers_27_fc2_bias4, alloc2459) R.vm.kill_object(alloc2458) R.vm.kill_object(model_decoder_layers_27_fc2_weight4) R.vm.kill_object(model_decoder_layers_27_fc2_bias4) gv3341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3341, R.dtype("float16")) cls.add5(alloc2456, alloc2459, alloc2460) R.vm.kill_object(alloc2456) 
R.vm.kill_object(alloc2459) model_decoder_layers_28_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1169] gv3342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3342, R.dtype("float16")) cls.layer_norm2(alloc2460, model_decoder_layers_28_self_attn_layer_norm_weight4, model_decoder_layers_28_self_attn_layer_norm_bias4, alloc2461) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias4) model_decoder_layers_28_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv3343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3343, R.dtype("float16")) _2461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_q_proj_bias4, alloc2462) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias4) gv3344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1313: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2462, gv3344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2462) model_decoder_layers_28_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv3345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3345, R.dtype("float16")) _2462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight4, alloc2461, alloc2463) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight4) gv3346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1314: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2463, gv3346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2463) model_decoder_layers_28_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv3347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3347, R.dtype("float16")) 
_2463: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_v_proj_bias4, alloc2464) R.vm.kill_object(alloc2461) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias4) gv3348: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1315: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2464, gv3348, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2464) gv3349: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2465: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3349, R.dtype("float16")) cls.concatenate1(reshape1313, reshape1314, reshape1315, alloc2465) R.vm.kill_object(reshape1313) R.vm.kill_object(reshape1314) R.vm.kill_object(reshape1315) gv3350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1316: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2465, gv3350, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2465) gv3351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3351, R.dtype("float16")) _2465: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1316, alloc2466) R.vm.kill_object(reshape1316) gv3352: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1317: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2466, gv3352, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2466) gv3353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1318: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1317, gv3353, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1317) model_decoder_layers_28_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv3354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3354, R.dtype("float16")) _2466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_28_self_attn_out_proj_weight4, reshape1318, model_decoder_layers_28_self_attn_out_proj_bias4, alloc2467) R.vm.kill_object(reshape1318) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias4) gv3355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3355, R.dtype("float16")) cls.add5(alloc2460, alloc2467, alloc2468) R.vm.kill_object(alloc2460) R.vm.kill_object(alloc2467) model_decoder_layers_28_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv3356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3356, R.dtype("float16")) cls.layer_norm2(alloc2468, model_decoder_layers_28_encoder_attn_layer_norm_weight4, model_decoder_layers_28_encoder_attn_layer_norm_bias4, alloc2469) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias4) model_decoder_layers_28_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv3357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2470: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3357, R.dtype("float16")) _2469: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight4, alloc2469, model_decoder_layers_28_encoder_attn_q_proj_bias4, alloc2470) R.vm.kill_object(alloc2469) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias4) gv3358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1319: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2470, gv3358, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2470) gv3359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1320: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1319, gv3359, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1319) gv3360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3360, R.dtype("float16")) _2470: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1320, alloc2471) 
R.vm.kill_object(reshape1320) gv3361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1321: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2471, gv3361, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2471) gv3362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1322: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1321, gv3362, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1321) model_decoder_layers_28_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv3363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3363, R.dtype("float16")) _2471: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight4, reshape1322, model_decoder_layers_28_encoder_attn_out_proj_bias4, alloc2472) R.vm.kill_object(reshape1322) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias4) gv3364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3364, R.dtype("float16")) cls.add5(alloc2468, alloc2472, alloc2473) R.vm.kill_object(alloc2468) R.vm.kill_object(alloc2472) model_decoder_layers_28_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv3365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3365, R.dtype("float16")) cls.layer_norm2(alloc2473, model_decoder_layers_28_final_layer_norm_weight4, model_decoder_layers_28_final_layer_norm_bias4, alloc2474) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias4) model_decoder_layers_28_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv3366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3366, R.dtype("float16")) _2474: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight4, alloc2474, model_decoder_layers_28_fc1_bias4, alloc2475) R.vm.kill_object(alloc2474) 
R.vm.kill_object(model_decoder_layers_28_fc1_weight4) R.vm.kill_object(model_decoder_layers_28_fc1_bias4) model_decoder_layers_28_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv3367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2476: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3367, R.dtype("float16")) _2475: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight4, alloc2475, model_decoder_layers_28_fc2_bias4, alloc2476) R.vm.kill_object(alloc2475) R.vm.kill_object(model_decoder_layers_28_fc2_weight4) R.vm.kill_object(model_decoder_layers_28_fc2_bias4) gv3368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3368, R.dtype("float16")) cls.add5(alloc2473, alloc2476, alloc2477) R.vm.kill_object(alloc2473) R.vm.kill_object(alloc2476) model_decoder_layers_29_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv3369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3369, R.dtype("float16")) 
cls.layer_norm2(alloc2477, model_decoder_layers_29_self_attn_layer_norm_weight4, model_decoder_layers_29_self_attn_layer_norm_bias4, alloc2478) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias4) model_decoder_layers_29_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv3370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3370, R.dtype("float16")) _2478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_q_proj_bias4, alloc2479) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias4) gv3371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1323: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2479, gv3371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2479) model_decoder_layers_29_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv3372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2480: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3372, R.dtype("float16")) _2479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight4, alloc2478, alloc2480) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight4) gv3373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1324: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2480, gv3373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2480) model_decoder_layers_29_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv3374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3374, R.dtype("float16")) _2480: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_v_proj_bias4, alloc2481) R.vm.kill_object(alloc2478) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias4) gv3375: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1325: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2481, gv3375, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2481) gv3376: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2482: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3376, R.dtype("float16")) cls.concatenate1(reshape1323, reshape1324, reshape1325, alloc2482) R.vm.kill_object(reshape1323) R.vm.kill_object(reshape1324) R.vm.kill_object(reshape1325) gv3377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1326: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2482, gv3377, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2482) gv3378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3378, R.dtype("float16")) _2482: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1326, alloc2483) R.vm.kill_object(reshape1326) gv3379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1327: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2483, gv3379, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2483) gv3380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1328: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1327, gv3380, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1327) model_decoder_layers_29_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv3381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3381, R.dtype("float16")) _2483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight4, reshape1328, model_decoder_layers_29_self_attn_out_proj_bias4, alloc2484) R.vm.kill_object(reshape1328) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias4) gv3382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3382, 
R.dtype("float16")) cls.add5(alloc2477, alloc2484, alloc2485) R.vm.kill_object(alloc2477) R.vm.kill_object(alloc2484) model_decoder_layers_29_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv3383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3383, R.dtype("float16")) cls.layer_norm2(alloc2485, model_decoder_layers_29_encoder_attn_layer_norm_weight4, model_decoder_layers_29_encoder_attn_layer_norm_bias4, alloc2486) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias4) model_decoder_layers_29_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv3384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2487: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3384, R.dtype("float16")) _2486: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight4, alloc2486, model_decoder_layers_29_encoder_attn_q_proj_bias4, alloc2487) R.vm.kill_object(alloc2486) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias4) gv3385: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1329: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2487, gv3385, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2487) gv3386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1330: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1329, gv3386, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1329) gv3387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3387, R.dtype("float16")) _2487: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1330, alloc2488) R.vm.kill_object(reshape1330) gv3388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1331: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2488, gv3388, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2488) gv3389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1332: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1331, gv3389, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1331) model_decoder_layers_29_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1200] gv3390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3390, R.dtype("float16")) _2488: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight4, reshape1332, model_decoder_layers_29_encoder_attn_out_proj_bias4, alloc2489) R.vm.kill_object(reshape1332) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias4) gv3391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3391, R.dtype("float16")) cls.add5(alloc2485, alloc2489, alloc2490) R.vm.kill_object(alloc2485) R.vm.kill_object(alloc2489) model_decoder_layers_29_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv3392: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3392, R.dtype("float16")) cls.layer_norm2(alloc2490, model_decoder_layers_29_final_layer_norm_weight4, model_decoder_layers_29_final_layer_norm_bias4, alloc2491) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias4) model_decoder_layers_29_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv3393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3393, R.dtype("float16")) _2491: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight4, alloc2491, model_decoder_layers_29_fc1_bias4, alloc2492) R.vm.kill_object(alloc2491) R.vm.kill_object(model_decoder_layers_29_fc1_weight4) R.vm.kill_object(model_decoder_layers_29_fc1_bias4) model_decoder_layers_29_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv3394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), 
gv3394, R.dtype("float16")) _2492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_29_fc2_weight4, alloc2492, model_decoder_layers_29_fc2_bias4, alloc2493) R.vm.kill_object(alloc2492) R.vm.kill_object(model_decoder_layers_29_fc2_weight4) R.vm.kill_object(model_decoder_layers_29_fc2_bias4) gv3395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3395, R.dtype("float16")) cls.add5(alloc2490, alloc2493, alloc2494) R.vm.kill_object(alloc2490) R.vm.kill_object(alloc2493) model_decoder_layers_30_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv3396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2495: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3396, R.dtype("float16")) cls.layer_norm2(alloc2494, model_decoder_layers_30_self_attn_layer_norm_weight4, model_decoder_layers_30_self_attn_layer_norm_bias4, alloc2495) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias4) model_decoder_layers_30_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv3397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3397, R.dtype("float16")) _2495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_q_proj_bias4, alloc2496) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias4) gv3398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1333: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2496, gv3398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2496) model_decoder_layers_30_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv3399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3399, R.dtype("float16")) _2496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight4, alloc2495, alloc2497) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight4) gv3400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1334: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2497, gv3400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2497) model_decoder_layers_30_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv3401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3401, R.dtype("float16")) _2497: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_v_proj_bias4, alloc2498) R.vm.kill_object(alloc2495) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias4) gv3402: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1335: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2498, gv3402, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2498) gv3403: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2499: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3403, R.dtype("float16")) 
cls.concatenate1(reshape1333, reshape1334, reshape1335, alloc2499) R.vm.kill_object(reshape1333) R.vm.kill_object(reshape1334) R.vm.kill_object(reshape1335) gv3404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1336: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2499, gv3404, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2499) gv3405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3405, R.dtype("float16")) _2499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1336, alloc2500) R.vm.kill_object(reshape1336) gv3406: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1337: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2500, gv3406, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2500) gv3407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1338: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1337, gv3407, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1337) model_decoder_layers_30_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv3408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3408, R.dtype("float16")) _2500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight4, reshape1338, model_decoder_layers_30_self_attn_out_proj_bias4, alloc2501) R.vm.kill_object(reshape1338) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias4) gv3409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3409, R.dtype("float16")) cls.add5(alloc2494, alloc2501, alloc2502) R.vm.kill_object(alloc2494) R.vm.kill_object(alloc2501) model_decoder_layers_30_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv3410: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2503: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv3410, R.dtype("float16")) cls.layer_norm2(alloc2502, model_decoder_layers_30_encoder_attn_layer_norm_weight4, model_decoder_layers_30_encoder_attn_layer_norm_bias4, alloc2503) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias4) model_decoder_layers_30_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv3411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2504: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3411, R.dtype("float16")) _2503: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight4, alloc2503, model_decoder_layers_30_encoder_attn_q_proj_bias4, alloc2504) R.vm.kill_object(alloc2503) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias4) gv3412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1339: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2504, gv3412, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2504) gv3413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1340: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1339, gv3413, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1339) gv3414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3414, R.dtype("float16")) _2504: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1340, alloc2505) R.vm.kill_object(reshape1340) gv3415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1341: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2505, gv3415, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2505) gv3416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1342: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1341, gv3416, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1341) model_decoder_layers_30_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv3417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3417, R.dtype("float16")) _2505: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight4, reshape1342, model_decoder_layers_30_encoder_attn_out_proj_bias4, alloc2506) R.vm.kill_object(reshape1342) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias4) gv3418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3418, R.dtype("float16")) cls.add5(alloc2502, alloc2506, alloc2507) R.vm.kill_object(alloc2502) R.vm.kill_object(alloc2506) model_decoder_layers_30_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv3419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3419, R.dtype("float16")) cls.layer_norm2(alloc2507, model_decoder_layers_30_final_layer_norm_weight4, model_decoder_layers_30_final_layer_norm_bias4, alloc2508) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias4) model_decoder_layers_30_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv3420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3420, R.dtype("float16")) _2508: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight4, alloc2508, model_decoder_layers_30_fc1_bias4, alloc2509) R.vm.kill_object(alloc2508) R.vm.kill_object(model_decoder_layers_30_fc1_weight4) R.vm.kill_object(model_decoder_layers_30_fc1_bias4) model_decoder_layers_30_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv3421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3421, R.dtype("float16")) _2509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight4, alloc2509, model_decoder_layers_30_fc2_bias4, alloc2510) R.vm.kill_object(alloc2509) R.vm.kill_object(model_decoder_layers_30_fc2_weight4) R.vm.kill_object(model_decoder_layers_30_fc2_bias4) gv3422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv3422, R.dtype("float16")) cls.add5(alloc2507, alloc2510, alloc2511) R.vm.kill_object(alloc2507) R.vm.kill_object(alloc2510) model_decoder_layers_31_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv3423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3423, R.dtype("float16")) cls.layer_norm2(alloc2511, model_decoder_layers_31_self_attn_layer_norm_weight4, model_decoder_layers_31_self_attn_layer_norm_bias4, alloc2512) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias4) model_decoder_layers_31_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv3424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3424, R.dtype("float16")) _2512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight4, alloc2512, model_decoder_layers_31_self_attn_q_proj_bias4, alloc2513) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias4) gv3425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1343: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2513, gv3425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2513) model_decoder_layers_31_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv3426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3426, R.dtype("float16")) _2513: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight4, alloc2512, alloc2514) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight4) gv3427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1344: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2514, gv3427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2514) model_decoder_layers_31_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv3428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2515: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3428, R.dtype("float16")) _2514: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight4, alloc2512, model_decoder_layers_31_self_attn_v_proj_bias4, alloc2515) R.vm.kill_object(alloc2512) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias4) gv3429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1345: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2515, gv3429, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2515) gv3430: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2516: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3430, R.dtype("float16")) cls.concatenate1(reshape1343, reshape1344, reshape1345, alloc2516) R.vm.kill_object(reshape1343) R.vm.kill_object(reshape1344) R.vm.kill_object(reshape1345) gv3431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1346: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2516, gv3431, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2516) gv3432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3432, R.dtype("float16")) _2516: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1346, alloc2517) R.vm.kill_object(reshape1346) gv3433: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1347: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2517, gv3433, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2517) gv3434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1348: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1347, gv3434, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1347) model_decoder_layers_31_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv3435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3435, R.dtype("float16")) _2517: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight4, reshape1348, model_decoder_layers_31_self_attn_out_proj_bias4, alloc2518) R.vm.kill_object(reshape1348) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias4) gv3436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3436, R.dtype("float16")) cls.add5(alloc2511, alloc2518, alloc2519) R.vm.kill_object(alloc2511) R.vm.kill_object(alloc2518) model_decoder_layers_31_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv3437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3437, R.dtype("float16")) cls.layer_norm2(alloc2519, model_decoder_layers_31_encoder_attn_layer_norm_weight4, model_decoder_layers_31_encoder_attn_layer_norm_bias4, alloc2520) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias4) model_decoder_layers_31_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv3438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2521: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3438, R.dtype("float16")) _2520: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight4, alloc2520, model_decoder_layers_31_encoder_attn_q_proj_bias4, alloc2521) R.vm.kill_object(alloc2520) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias4) gv3439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1349: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2521, gv3439, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2521) gv3440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1350: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1349, gv3440, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1349) gv3441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3441, R.dtype("float16")) _2521: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1350, alloc2522) R.vm.kill_object(reshape1350) gv3442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1351: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2522, gv3442, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2522) gv3443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1352: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1351, gv3443, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1351) model_decoder_layers_31_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv3444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3444, R.dtype("float16")) _2522: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight4, reshape1352, model_decoder_layers_31_encoder_attn_out_proj_bias4, alloc2523) R.vm.kill_object(reshape1352) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias4) 
gv3445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3445, R.dtype("float16")) R.vm.kill_object(storage39) cls.add5(alloc2519, alloc2523, alloc2524) R.vm.kill_object(alloc2519) R.vm.kill_object(alloc2523) model_decoder_layers_31_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv3446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2525: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3446, R.dtype("float16")) cls.layer_norm2(alloc2524, model_decoder_layers_31_final_layer_norm_weight4, model_decoder_layers_31_final_layer_norm_bias4, alloc2525) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias4) model_decoder_layers_31_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1252] gv3447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3447, R.dtype("float16")) R.vm.kill_object(storage37) _2525: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", 
model_decoder_layers_31_fc1_weight4, alloc2525, model_decoder_layers_31_fc1_bias4, alloc2526) R.vm.kill_object(alloc2525) R.vm.kill_object(model_decoder_layers_31_fc1_weight4) R.vm.kill_object(model_decoder_layers_31_fc1_bias4) model_decoder_layers_31_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv3448: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3448, R.dtype("float16")) R.vm.kill_object(storage38) _2526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_31_fc2_weight4, alloc2526, model_decoder_layers_31_fc2_bias4, alloc2527) R.vm.kill_object(alloc2526) R.vm.kill_object(model_decoder_layers_31_fc2_weight4) R.vm.kill_object(model_decoder_layers_31_fc2_bias4) gv3449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3449, R.dtype("float16")) R.vm.kill_object(storage40) cls.add5(alloc2524, alloc2527, alloc2528) R.vm.kill_object(alloc2524) R.vm.kill_object(alloc2527) model_decoder_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv3450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),))
    # --- tail of the (partially visible) decoder forward function above ---
    # Final decoder layer norm, then project the last hidden state to vocab
    # logits: (1, 1, 1280) fp16 -> (1, 1, 51866) fp32 via a fused cuBLAS kernel.
    alloc2529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3450, R.dtype("float16"))
    R.vm.kill_object(storage41)
    cls.layer_norm2(alloc2528, model_decoder_layer_norm_weight4, model_decoder_layer_norm_bias4, alloc2529)
    R.vm.kill_object(alloc2528)
    R.vm.kill_object(model_decoder_layer_norm_weight4)
    R.vm.kill_object(model_decoder_layer_norm_bias4)
    storage42: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    alloc2530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage42, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16"))
    R.vm.kill_object(storage42)
    # cls.index selects the last position's hidden state into the (1, 1, 1280) buffer.
    cls.index(alloc2529, alloc2530)
    R.vm.kill_object(alloc2529)
    storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    alloc2531: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32"))
    R.vm.kill_object(storage)
    # Tied LM head: logits = hidden @ embed_tokens_weight^T (permute_dims + matmul fused, cuBLAS offload).
    _2530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul2_cublas", model_decoder_embed_tokens_weight4, alloc2530, alloc2531)
    R.vm.kill_object(model_decoder_embed_tokens_weight4)
    R.vm.kill_object(alloc2530)
    return alloc2531

# Lowered Relax VM function: renormalize each row of `probs` under its per-row
# nucleus (top-p) threshold. The actual math lives in the scheduled TIR kernels
# cls.top_p_pivot_cutoff / cls.top_p_renorm_after_cutoff (not visible in this
# chunk) -- presumably pivot search below top_p mass, then rescale; confirm
# against the kernel definitions. Everything here is generated plumbing:
# rank/dtype checks, symbolic-shape binding via a shape heap, explicit storage
# alloc and kill_object lifetime markers.
@R.function
def renormalize_by_top_p(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), top_p: R.Tensor(("batch_size",), dtype="float32"), init_pivots: R.Tensor(("batch_size", 3), dtype="float32")) -> R.Tensor(("batch_size", "vocab_size"), dtype="float32"):
    batch_size = T.int64()
    vocab_size = T.int64()
    # tir_var_upper_bound caps batch_size at 8 -- the fixed 32-byte scratch
    # buffers below (8 x float32) are sized to that bound.
    R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
    cls = Module
    # Scratch int64 heap: match_shape writes symbolic dims (slot 1 = batch_size,
    # slot 0/3 per the encoded index streams), make_shape reads them back.
    shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
    # Rank/dtype validation of the three parameters.
    R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", init_pivots, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    # Bind/verify symbolic shapes against the heap.
    R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", init_pivots, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(3), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    cls.shape_func4(shape_heap)
    # Two per-row float32 scalars produced by the pivot-cutoff kernel
    # (cutoff pivot and renormalization sum, going by how they are consumed below).
    storage43: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3451: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),))
    alloc2532: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage43, R.prim_value(0), gv3451, R.dtype("float32"))
    R.vm.kill_object(storage43)
    storage44: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3452: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),))
    alloc2533: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage44, R.prim_value(0), gv3452, R.dtype("float32"))
    R.vm.kill_object(storage44)
    cls.top_p_pivot_cutoff(probs, top_p, init_pivots, alloc2532, alloc2533)
    # NOTE(review): `lv6` is never used after this point in the function --
    # dead binding left over from lowering; harmless.
    lv6: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1)) = alloc2532, alloc2533
    gv3453: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
    storage45: R.Object = R.vm.alloc_storage(gv3453, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3454: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    # Output buffer (batch_size, vocab_size); written by the renorm kernel.
    alloc2534: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage45, R.prim_value(0), gv3454, R.dtype("float32"))
    R.vm.kill_object(storage45)
    cls.top_p_renorm_after_cutoff(probs, alloc2532, alloc2533, alloc2534)
    R.vm.kill_object(alloc2532)
    R.vm.kill_object(alloc2533)
    # Verify the return value matches the declared (batch_size, vocab_size) shape.
    R.call_packed("vm.builtin.match_shape", alloc2534, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=return, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    return alloc2534

@R.function
def sample_with_top_p(sorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32"), top_p: R.Tensor(("batch_size",
dtype="float32")) -> R.Tensor(("num_samples",), dtype="int32"):
    # Lowered Relax VM function: draw one vocabulary id per sample from rows of
    # a probability matrix that the caller has already sorted (descending, going
    # by the cumsum + sorted-index lookup below -- confirm against the
    # cls.cumsum / cls.get_index_from_sorted kernel definitions, which are not
    # visible in this chunk). `uniform_samples` are the random draws,
    # `sample_indices` maps each sample to its batch row, `top_p` truncates
    # each row's distribution. Returns (num_samples,) int32 token ids.
    num_samples = T.int64()
    batch_size = T.int64()
    vocab_size = T.int64()
    R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
    cls = Module
    # Scratch int64 heap for symbolic dims; shape_func3 derives the extra slots
    # (e.g. workspace sizes) used by make_shape below.
    shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(6),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
    # Rank/dtype validation of the five parameters.
    R.call_packed("vm.builtin.check_tensor_info", sorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    # Bind/verify symbolic shapes (batch_size, vocab_size, num_samples) in the heap.
    R.call_packed("vm.builtin.match_shape", sorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    cls.shape_func3(shape_heap)
    # Reshape the two per-sample vectors to column form (num_samples, 1).
    gv2568: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    uniform_samples1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv2568, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),))
    gv2569: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    sample_indices1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv2569, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),))
    gv2570: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    # NOTE(review): misleading generated name -- `sample_indices2` is the
    # reshaped float32 `top_p` values (batch_size, 1), not an index tensor.
    sample_indices2: R.Tensor((batch_size, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", top_p, gv2570, sinfo_args=(R.Tensor((batch_size, 1), dtype="float32"),))
    # storage33 (32 bytes) backs both alloc1978 and, after alloc1978 is killed,
    # alloc1981 -- deliberate buffer reuse emitted by memory planning.
    storage33: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2571: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1978: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2571, R.dtype("int32"))
    gv2572: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=1),))
    # Fill alloc1978 via the cls.full TIR kernel (dynamic-shape call).
    R.call_packed("vm.builtin.call_tir_dyn", cls.full, alloc1978, gv2572, sinfo_args=(R.Tuple,))
    gv2573: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),))
    storage34: R.Object = R.vm.alloc_storage(gv2573, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2574: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),))
    # uint8 workspace for the cumsum kernel; size comes from shape_func3.
    lv1: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage34, R.prim_value(0), gv2574, R.dtype("uint8"))
    R.vm.kill_object(storage34)
    gv2575: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(5), sinfo_args=(R.Shape(ndim=1),))
    storage35: R.Object = R.vm.alloc_storage(gv2575, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2576: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1979: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage35, R.prim_value(0), gv2576, R.dtype("float32"))
    R.vm.kill_object(storage35)
    # Row-wise cumulative sums of sorted_probs -> alloc1979.
    cls.cumsum(sorted_probs, lv1, alloc1979)
    R.vm.kill_object(lv1)
    storage36: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2577: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1980: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage36, R.prim_value(0), gv2577, R.dtype("float32"))
    R.vm.kill_object(storage36)
    # Per-row renormalization factor from the cumsum, the top_p values
    # (sample_indices2) and the cls.full output.
    cls.get_renorm_prob(alloc1979, sample_indices2, alloc1978, alloc1980)
    R.vm.kill_object(sample_indices2)
    R.vm.kill_object(alloc1978)
    gv2578: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1981: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2578, R.dtype("int32"))
    R.vm.kill_object(storage33)
    # Binary-search-style lookup of each uniform draw in the (renormalized)
    # cumsum, mapped back through sorted_indices to original token ids.
    cls.get_index_from_sorted(alloc1979, sorted_indices, alloc1980, uniform_samples1, sample_indices1, alloc1981)
    R.vm.kill_object(uniform_samples1)
    R.vm.kill_object(sample_indices1)
    R.vm.kill_object(alloc1979)
    R.vm.kill_object(alloc1980)
    gv2579: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
    # Flatten (num_samples, 1) -> (num_samples,) for the caller.
    gv2: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc1981, gv2579, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),))
    R.vm.kill_object(alloc1981)
    return gv2

# --- head of sampler_take_probs (definition continues past this chunk) ---
@R.function
def sampler_take_probs(unsorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), sample_indices:
R.Tensor(("num_samples",), dtype="int32"), sampling_result: R.Tensor(("num_samples",), dtype="int32"), lobprob_offsets: R.Tensor(("num_positions",), dtype="int32")) -> R.Tuple(R.Tensor(("num_samples",), dtype="float32"), R.Tensor(("num_positions",), dtype="float32"), R.Tensor(("num_positions",), dtype="int32")): num_samples = T.int64() num_positions = T.int64() batch_size = T.int64() vocab_size = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(4),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", unsorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sampling_result, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", lobprob_offsets, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, 
annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", unsorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sampling_result, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", lobprob_offsets, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) storage: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) alloc: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage, R.prim_value(0), gv, R.dtype("float32")) R.vm.kill_object(storage) storage1: R.Object = 
R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) alloc1: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage1, R.prim_value(0), gv1, R.dtype("float32")) R.vm.kill_object(storage1) storage2: R.Object = R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) alloc2: R.Tensor(dtype="int32", ndim=1) = R.vm.alloc_tensor(storage2, R.prim_value(0), gv2, R.dtype("int32")) R.vm.kill_object(storage2) cls.sampler_take_probs_tir(unsorted_probs, sorted_indices, sample_indices, sampling_result, lobprob_offsets, alloc, alloc1, alloc2) gv3: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="int32", ndim=1)) = alloc, alloc1, alloc2 R.vm.kill_object(alloc) R.vm.kill_object(alloc1) R.vm.kill_object(alloc2) gv3_1: R.Tensor(dtype="float32", ndim=1) = gv3[0] R.call_packed("vm.builtin.match_shape", gv3_1, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv4: R.Tensor(dtype="float32", ndim=1) = gv3[1] R.call_packed("vm.builtin.match_shape", gv4, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv5: R.Tensor(dtype="int32", ndim=1) = gv3[2] 
R.call_packed("vm.builtin.match_shape", gv5, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) return gv3 @R.function def sampler_verify_draft_tokens(draft_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), draft_tokens: R.Tensor(("num_nodes",), dtype="int32"), model_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), token_tree_first_child: R.Tensor(("num_nodes",), dtype="int32"), token_tree_next_sibling: R.Tensor(("num_nodes",), dtype="int32"), uniform_samples: R.Tensor(("num_nodes",), dtype="float32"), token_tree_parent_ptr: R.Tensor(("nbatch",), dtype="int32")) -> R.Tuple(R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), R.Tensor(("nbatch",), dtype="int32")): num_nodes = T.int64() vocab_size = T.int64() nbatch = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", draft_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", draft_tokens, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", model_probs, R.prim_value(2), R.dtype("float32"), 
R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_first_child, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_next_sibling, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_parent_ptr, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", draft_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", draft_tokens, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", model_probs, shape_heap, R.prim_value(2), R.prim_value(3), 
R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_first_child, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_next_sibling, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_parent_ptr, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) cls.batch_verify_on_gpu_single_kernel(draft_probs, draft_tokens, model_probs, token_tree_first_child, token_tree_next_sibling, uniform_samples, token_tree_parent_ptr) gv4: R.Tuple(R.Tensor((num_nodes, vocab_size), dtype="float32"), R.Tensor((nbatch,), dtype="int32")) = model_probs, token_tree_parent_ptr return gv4 @R.function def softmax_with_temperature(logits: R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"), temperature: R.Tensor(("batch_size",), dtype="float32")) -> R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"): batch_size = T.int64() vocab_size 
= T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", logits, R.prim_value(3), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", temperature, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", logits, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", temperature, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) cls.shape_func5(shape_heap) gv3455: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) lv: R.Tensor((batch_size, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", logits, gv3455, sinfo_args=(R.Tensor((batch_size, vocab_size), dtype="float32"),)) gv3456: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage46: R.Object = R.vm.alloc_storage(gv3456, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3457: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) alloc2535: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage46, R.prim_value(0), gv3457, R.dtype("float32")) R.vm.kill_object(storage46) gv3458: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage47: R.Object = R.vm.alloc_storage(gv3458, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3459: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) alloc2536: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage47, R.prim_value(0), gv3459, R.dtype("float32")) R.vm.kill_object(storage47) cls.chunk_lse(lv, temperature, alloc2535, alloc2536) lv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="float32", ndim=2)) = alloc2535, alloc2536 gv3460: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage48: R.Object = R.vm.alloc_storage(gv3460, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3461: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc2537: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage48, R.prim_value(0), gv3461, R.dtype("float32")) R.vm.kill_object(storage48) cls.softmax_with_chunked_sum(lv, temperature, alloc2535, alloc2536, alloc2537) R.vm.kill_object(lv) 
R.vm.kill_object(alloc2535) R.vm.kill_object(alloc2536) gv3462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=3),)) gv: R.Tensor((batch_size, 1, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", alloc2537, gv3462, sinfo_args=(R.Tensor((batch_size, 1, vocab_size), dtype="float32"),)) R.vm.kill_object(alloc2537) return gv # Metadata omitted. Use show_meta=True in script() method to show it.